From 9594a4986192f99c01a7c0a1779b5ac0eff8e208 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:53:25 +0900 Subject: KVM: MMU: Use __gfn_to_rmap() to clean up kvm_handle_hva() We can treat every level uniformly. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 28c8fbcc676..2beb95b57cb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1281,14 +1281,14 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; gfn_t gfn = memslot->base_gfn + gfn_offset; - ret = handler(kvm, &memslot->rmap[gfn_offset], data); + ret = 0; - for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - struct kvm_lpage_info *linfo; + for (j = PT_PAGE_TABLE_LEVEL; + j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { + unsigned long *rmapp; - linfo = lpage_info_slot(gfn, memslot, - PT_DIRECTORY_LEVEL + j); - ret |= handler(kvm, &linfo->rmap_pde, data); + rmapp = __gfn_to_rmap(gfn, j, memslot); + ret |= handler(kvm, rmapp, data); } trace_kvm_age_page(hva, memslot, ret); retval |= ret; -- cgit v1.2.3 From d19a748b1c42b133e9263e9023c1d162efa6f4ad Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:54:30 +0900 Subject: KVM: Introduce hva_to_gfn_memslot() for kvm_handle_hva() This restricts hva handling in mmu code and makes it easier to extend kvm_handle_hva() so that it can treat a range of addresses later in this patch series. Signed-off-by: Takuya Yoshikawa Cc: Alexander Graf Cc: Paul Mackerras Signed-off-by: Marcelo Tosatti --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 6 +++--- arch/x86/kvm/mmu.c | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d03eb6f7b05..37037553fe6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -772,10 +772,10 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { - gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + gfn_t gfn = hva_to_gfn_memslot(hva, memslot); + gfn_t gfn_offset = gfn - memslot->base_gfn; - ret = handler(kvm, &memslot->rmap[gfn_offset], - memslot->base_gfn + gfn_offset); + ret = handler(kvm, &memslot->rmap[gfn_offset], gfn); retval |= ret; } } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2beb95b57cb..170a632d9d3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1278,8 +1278,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { - gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - gfn_t gfn = memslot->base_gfn + gfn_offset; + gfn_t gfn = hva_to_gfn_memslot(hva, memslot); ret = 0; -- cgit v1.2.3 From 84504ef38673fa021b3d8f3da2b79cf878b33315 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:55:48 +0900 Subject: KVM: MMU: Make kvm_handle_hva() handle range of addresses When guest's memory is backed by THP pages, MMU notifier needs to call kvm_unmap_hva(), which in turn leads to kvm_handle_hva(), in a loop to invalidate a range of pages which constitute one huge page: for each page for each memslot if page is in memslot unmap using rmap This means although every page in that range is expected to be found in the same 
memslot, we are forced to check unrelated memslots many times. If the guest has more memslots, the situation will become worse. Furthermore, if the range does not include any pages in the guest's memory, the loop over the pages will just consume extra time. This patch, together with the following patches, solves this problem by introducing kvm_handle_hva_range() which makes the loop look like this: for each memslot for each page in memslot unmap using rmap In this new processing, the actual work is converted to a loop over rmap which is much more cache friendly than before. Signed-off-by: Takuya Yoshikawa Cc: Alexander Graf Cc: Paul Mackerras Signed-off-by: Marcelo Tosatti --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 36 ++++++++++++++++++++++++------- arch/x86/kvm/mmu.c | 42 ++++++++++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 37037553fe6..1a470bc2876 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_put; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, - unsigned long gfn)) +static int kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, + unsigned long *rmapp, + unsigned long gfn)) { int ret; int retval = 0; @@ -767,12 +770,22 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { - unsigned long start = memslot->userspace_addr; - unsigned long end; + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn, gfn+1, ..., gfn_end-1}. 
+ */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - end = start + (memslot->npages << PAGE_SHIFT); - if (hva >= start && hva < end) { - gfn_t gfn = hva_to_gfn_memslot(hva, memslot); + for (; gfn < gfn_end; ++gfn) { gfn_t gfn_offset = gfn - memslot->base_gfn; ret = handler(kvm, &memslot->rmap[gfn_offset], gfn); @@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return retval; } +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn)) +{ + return kvm_handle_hva_range(kvm, hva, hva + 1, handler); +} + static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long gfn) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 170a632d9d3..7235b0c9587 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1259,10 +1259,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, return 0; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - unsigned long data, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, - unsigned long data)) +static int kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + int (*handler)(struct kvm *kvm, + unsigned long *rmapp, + unsigned long data)) { int j; int ret; @@ -1273,13 +1276,22 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { - unsigned long start = memslot->userspace_addr; - unsigned long end; + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; - end = start + (memslot->npages << PAGE_SHIFT); - if (hva >= start && hva < end) { - gfn_t gfn = hva_to_gfn_memslot(hva, memslot); + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn, gfn+1, ..., gfn_end-1}. + */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + for (; gfn < gfn_end; ++gfn) { ret = 0; for (j = PT_PAGE_TABLE_LEVEL; @@ -1289,7 +1301,9 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, rmapp = __gfn_to_rmap(gfn, j, memslot); ret |= handler(kvm, rmapp, data); } - trace_kvm_age_page(hva, memslot, ret); + trace_kvm_age_page(memslot->userspace_addr + + (gfn - memslot->base_gfn) * PAGE_SIZE, + memslot, ret); retval |= ret; } } @@ -1297,6 +1311,14 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return retval; } +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + unsigned long data, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long data)) +{ + return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); +} + int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) { return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); -- cgit v1.2.3 From b3ae2096974b12c3af2ad1a4e7716b084949867f Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:56:33 +0900 Subject: KVM: Introduce kvm_unmap_hva_range() for kvm_mmu_notifier_invalidate_range_start() When we tested KVM under memory pressure, with THP enabled on the host, we noticed that MMU notifier took a long time to invalidate huge pages. Since the invalidation was done with mmu_lock held, it not only wasted the CPU but also made the host harder to respond. 
This patch mitigates this by using kvm_handle_hva_range(). Signed-off-by: Takuya Yoshikawa Cc: Alexander Graf Cc: Paul Mackerras Signed-off-by: Marcelo Tosatti --- arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 7 +++++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 5 +++++ 4 files changed, 15 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 50ea12fd7bf..572ad014126 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -52,6 +52,8 @@ struct kvm; extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +extern int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end); extern int kvm_age_hva(struct kvm *kvm, unsigned long hva); extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 1a470bc2876..3c635c0616b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -870,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) return 0; } +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + if (kvm->arch.using_mmu_notifiers) + kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); + return 0; +} + static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long gfn) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a3e9409e90b..d4aab865606 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -944,6 +944,7 @@ extern bool kvm_rebooting; #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_age_hva(struct kvm *kvm, unsigned long hva); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7235b0c9587..d2855f895fd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1324,6 +1324,11 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); } +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); +} + void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); -- cgit v1.2.3 From 77d11309b3a10e1ce112058ec2c9b7b979bcf311 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:57:17 +0900 Subject: KVM: Separate rmap_pde from kvm_lpage_info->write_count This makes it possible to loop over rmap_pde arrays in the same way as we do over rmap so that we can optimize kvm_handle_hva_range() easily in the following patch. 
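As an illustration of the layout this enables, here is a minimal user-space sketch of the per-level indexing and sizing arithmetic. It mirrors gfn_to_index() and the lpages computation in the patch below, but the slot geometry, the constants and the main() harness are invented for the example and it is not kernel code; it assumes the usual x86 setup of 512 entries per page-table level.

    #include <stdio.h>

    /* Nine bits of gfn are consumed per level above the 4K level. */
    #define HPAGE_GFN_SHIFT(level)  (((level) - 1) * 9)

    static unsigned long gfn_to_index(unsigned long gfn,
                                      unsigned long base_gfn, int level)
    {
            return (gfn >> HPAGE_GFN_SHIFT(level)) -
                   (base_gfn >> HPAGE_GFN_SHIFT(level));
    }

    int main(void)
    {
            /* Hypothetical 1GB-aligned slot of 1GB (0x40000 4K pages). */
            unsigned long base_gfn = 0x100000, npages = 0x40000;
            int level;

            /* Size of the flat rmap array needed per large-page level. */
            for (level = 2; level <= 3; ++level)
                    printf("level %d: %lu rmap heads\n", level,
                           gfn_to_index(base_gfn + npages - 1,
                                        base_gfn, level) + 1);
            return 0;
    }

With rmap_pde kept as one flat array per level, the rmap heads for consecutive gfns at a given level sit next to each other in memory, which is what allows kvm_handle_hva_range() to walk them linearly in the following patch.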
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 6 +++--- arch/x86/kvm/x86.c | 11 +++++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d4aab865606..4f98da9243f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -500,11 +500,11 @@ struct kvm_vcpu_arch { }; struct kvm_lpage_info { - unsigned long rmap_pde; int write_count; }; struct kvm_arch_memory_slot { + unsigned long *rmap_pde[KVM_NR_PAGE_SIZES - 1]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d2855f895fd..3b3f5ae5da6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -960,13 +960,13 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, struct kvm_memory_slot *slot) { - struct kvm_lpage_info *linfo; + unsigned long idx; if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - linfo = lpage_info_slot(gfn, slot, level); - return &linfo->rmap_pde; + idx = gfn_to_index(gfn, slot->base_gfn, level); + return &slot->arch.rmap_pde[level - PT_DIRECTORY_LEVEL][idx]; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59b59508ff0..829b4e97255 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6314,6 +6314,10 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, int i; for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->arch.rmap_pde[i] != dont->arch.rmap_pde[i]) { + kvm_kvfree(free->arch.rmap_pde[i]); + free->arch.rmap_pde[i] = NULL; + } if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { kvm_kvfree(free->arch.lpage_info[i]); free->arch.lpage_info[i] = NULL; @@ -6333,6 +6337,11 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; + slot->arch.rmap_pde[i] = + kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap_pde[i])); + if (!slot->arch.rmap_pde[i]) + goto out_free; + slot->arch.lpage_info[i] = kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); if (!slot->arch.lpage_info[i]) @@ -6361,7 +6370,9 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) out_free: for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + kvm_kvfree(slot->arch.rmap_pde[i]); kvm_kvfree(slot->arch.lpage_info[i]); + slot->arch.rmap_pde[i] = NULL; slot->arch.lpage_info[i] = NULL; } return -ENOMEM; -- cgit v1.2.3 From 048212d0bc0b1769a4bbecd7ace8c8d237577d1b Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:57:59 +0900 Subject: KVM: MMU: Add memslot parameter to hva handlers This is needed to push trace_kvm_age_page() into kvm_age_rmapp() in the following patch. 
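The shape of the change, as a stand-alone sketch of the handler callback type before and after this patch (forward declarations only; the _old/_new names are invented for the comparison and do not exist in the kernel):

    struct kvm;
    struct kvm_memory_slot;

    /* Handler signature taken by kvm_handle_hva_range() before this patch: */
    typedef int (*rmap_handler_old_t)(struct kvm *kvm, unsigned long *rmapp,
                                      unsigned long data);

    /*
     * After this patch the owning memslot is threaded through as well, giving
     * kvm_age_rmapp() enough context to do its own tracing later on:
     */
    typedef int (*rmap_handler_new_t)(struct kvm *kvm, unsigned long *rmapp,
                                      struct kvm_memory_slot *slot,
                                      unsigned long data);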
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3b3f5ae5da6..dfd7a9a3115 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1200,7 +1200,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1218,7 +1218,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1265,6 +1265,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, unsigned long data, int (*handler)(struct kvm *kvm, unsigned long *rmapp, + struct kvm_memory_slot *slot, unsigned long data)) { int j; @@ -1299,7 +1300,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, unsigned long *rmapp; rmapp = __gfn_to_rmap(gfn, j, memslot); - ret |= handler(kvm, rmapp, data); + ret |= handler(kvm, rmapp, memslot, data); } trace_kvm_age_page(memslot->userspace_addr + (gfn - memslot->base_gfn) * PAGE_SIZE, @@ -1314,6 +1315,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data, int (*handler)(struct kvm *kvm, unsigned long *rmapp, + struct kvm_memory_slot *slot, unsigned long data)) { return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); @@ -1335,7 +1337,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) } static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { u64 *sptep; struct rmap_iterator uninitialized_var(iter); @@ -1350,7 +1352,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, * out actively used pages or breaking up actively used hugepages. */ if (!shadow_accessed_mask) - return kvm_unmap_rmapp(kvm, rmapp, data); + return kvm_unmap_rmapp(kvm, rmapp, slot, data); for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { @@ -1367,7 +1369,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, } static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1405,7 +1407,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); - kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); + kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0); kvm_flush_remote_tlbs(vcpu->kvm); } -- cgit v1.2.3 From f395302e09ef783b8f82d1160510a95aa8c66dbc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:58:48 +0900 Subject: KVM: MMU: Push trace_kvm_age_page() into kvm_age_rmapp() This restricts the tracing to page aging and makes it possible to optimize kvm_handle_hva_range() further in the following patch. 
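A stand-alone sketch of the resulting call flow; the signatures are heavily simplified and the tracepoint is replaced by a printf so the sketch compiles on its own. Only the way the hva is threaded through the opaque data argument is taken from the patch below.

    #include <stdio.h>

    static int kvm_age_rmapp(unsigned long *rmapp, unsigned long data)
    {
            int young = 0;

            (void)rmapp;  /* the real code scans the rmap chain here */
            /* @data carries the hva that was passed to kvm_age_hva(). */
            printf("trace_kvm_age_page(hva=%#lx, young=%d)\n", data, young);
            return young;
    }

    static int kvm_handle_hva(unsigned long hva, unsigned long data,
                              int (*handler)(unsigned long *, unsigned long))
    {
            unsigned long rmap_head = 0;  /* stands in for the real rmap head */

            (void)hva;  /* the real code uses it to pick memslot and rmaps */
            return handler(&rmap_head, data);
    }

    static int kvm_age_hva(unsigned long hva)
    {
            /* The hva doubles as @data so the tracepoint can report it. */
            return kvm_handle_hva(hva, hva, kvm_age_rmapp);
    }

    int main(void)
    {
            kvm_age_hva(0x7f0000000000ul);
            return 0;
    }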
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dfd7a9a3115..58adec38448 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1269,8 +1269,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, unsigned long data)) { int j; - int ret; - int retval = 0; + int ret = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1293,8 +1292,6 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); for (; gfn < gfn_end; ++gfn) { - ret = 0; - for (j = PT_PAGE_TABLE_LEVEL; j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { unsigned long *rmapp; @@ -1302,14 +1299,10 @@ static int kvm_handle_hva_range(struct kvm *kvm, rmapp = __gfn_to_rmap(gfn, j, memslot); ret |= handler(kvm, rmapp, memslot, data); } - trace_kvm_age_page(memslot->userspace_addr + - (gfn - memslot->base_gfn) * PAGE_SIZE, - memslot, ret); - retval |= ret; } } - return retval; + return ret; } static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, @@ -1351,8 +1344,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) - return kvm_unmap_rmapp(kvm, rmapp, slot, data); + if (!shadow_accessed_mask) { + young = kvm_unmap_rmapp(kvm, rmapp, slot, data); + goto out; + } for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { @@ -1364,7 +1359,9 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, (unsigned long *)sptep); } } - +out: + /* @data has hva passed to kvm_age_hva(). */ + trace_kvm_age_page(data, slot, young); return young; } @@ -1413,7 +1410,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); + return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp); } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -- cgit v1.2.3 From bcd3ef58283a471d6b65855b83f78bd39eb55391 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:59:33 +0900 Subject: KVM: MMU: Avoid handling same rmap_pde in kvm_handle_hva_range() When we invalidate a THP page, we call the handler with the same rmap_pde argument 512 times in the following loop: for each guest page in the range for each level unmap using rmap This patch avoids these extra handler calls by changing the loop order like this: for each level for each rmap in the range unmap using rmap With the preceding patches in the patch series, this made THP page invalidation more than 5 times faster on our x86 host: the host became more responsive during swapping the guest's memory as a result. 
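The effect of the reordering can be illustrated with a small user-space sketch that just counts handler invocations for one 2M THP worth of pages. It assumes the x86 configuration of three page sizes and a 2M-aligned range; the index arithmetic mirrors gfn_to_index(), everything else is made up for the example.

    #include <stdio.h>

    #define LEVELS          3    /* 4K, 2M and 1G page sizes */
    #define GFNS            512  /* one 2M THP worth of 4K pages */
    #define SHIFT(level)    (((level) - 1) * 9)  /* gfns per rmap head */

    int main(void)
    {
            unsigned long old_calls, new_calls = 0;
            int level;

            /* Old order: every gfn visits the rmap head of every level. */
            old_calls = (unsigned long)GFNS * LEVELS;

            /* New order: one call per distinct rmap head at each level. */
            for (level = 1; level <= LEVELS; ++level) {
                    unsigned long idx_end = (GFNS - 1) >> SHIFT(level);

                    new_calls += idx_end + 1;  /* idx runs 0 .. idx_end */
            }

            printf("handler calls: old=%lu, new=%lu\n", old_calls, new_calls);
            return 0;
    }

The 2M and 1G levels collapse from 512 calls on the same rmap_pde each to a single call per level, which is where the invalidation speedup described above comes from.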
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 58adec38448..a5d6ef785b7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1277,7 +1277,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, kvm_for_each_memslot(memslot, slots) { unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; + gfn_t gfn_start, gfn_end; hva_start = max(start, memslot->userspace_addr); hva_end = min(end, memslot->userspace_addr + @@ -1286,19 +1286,27 @@ static int kvm_handle_hva_range(struct kvm *kvm, continue; /* * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn, gfn+1, ..., gfn_end-1}. + * {gfn_start, gfn_start+1, ..., gfn_end-1}. */ - gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_start = hva_to_gfn_memslot(hva_start, memslot); gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - for (; gfn < gfn_end; ++gfn) { - for (j = PT_PAGE_TABLE_LEVEL; - j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { - unsigned long *rmapp; + for (j = PT_PAGE_TABLE_LEVEL; + j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { + unsigned long idx, idx_end; + unsigned long *rmapp; - rmapp = __gfn_to_rmap(gfn, j, memslot); - ret |= handler(kvm, rmapp, memslot, data); - } + /* + * {idx(page_j) | page_j intersects with + * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}. + */ + idx = gfn_to_index(gfn_start, memslot->base_gfn, j); + idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j); + + rmapp = __gfn_to_rmap(gfn_start, j, memslot); + + for (; idx <= idx_end; ++idx) + ret |= handler(kvm, rmapp++, memslot, data); } } -- cgit v1.2.3 From 9d3c92af47d853d4e31ee971dba7bc086275b7b3 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:50:48 +0800 Subject: KVM: x86: remove unnecessary mark_page_dirty fix: [ 132.474633] 3.5.0-rc1+ #50 Not tainted [ 132.474634] ------------------------------- [ 132.474635] include/linux/kvm_host.h:369 suspicious rcu_dereference_check() usage! [ 132.474636] [ 132.474636] other info that might help us debug this: [ 132.474636] [ 132.474638] [ 132.474638] rcu_scheduler_active = 1, debug_locks = 1 [ 132.474640] 1 lock held by qemu-kvm/2832: [ 132.474657] #0: (&vcpu->mutex){+.+.+.}, at: [] vcpu_load+0x1e/0x91 [kvm] [ 132.474658] [ 132.474658] stack backtrace: [ 132.474660] Pid: 2832, comm: qemu-kvm Not tainted 3.5.0-rc1+ #50 [ 132.474661] Call Trace: [ 132.474665] [] lockdep_rcu_suspicious+0xfc/0x105 [ 132.474675] [] kvm_memslots+0x6d/0x75 [kvm] [ 132.474683] [] gfn_to_memslot+0x14/0x4c [kvm] [ 132.474693] [] mark_page_dirty+0x17/0x2a [kvm] [ 132.474706] [] kvm_arch_vcpu_ioctl+0xbcf/0xc07 [kvm] Actually, we do not write vcpu->arch.time at this time, mark_page_dirty should be removed. 
Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 829b4e97255..ecc71dde4bb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2632,7 +2632,6 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) if (!vcpu->arch.time_page) return -EINVAL; src->flags |= PVCLOCK_GUEST_STOPPED; - mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT); kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return 0; } -- cgit v1.2.3 From 86fde74cf5b829627b37ca86322acfdd99b524b8 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:52:52 +0800 Subject: KVM: MMU: track the refcount when unmap the page It will trigger a WARN_ON if the page has been freed but it is still used in mmu, it can help us to detect mm bug early Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a5d6ef785b7..685a4855738 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep) return 0; pfn = spte_to_pfn(old_spte); + + /* + * KVM does not hold the refcount of the page used by + * kvm mmu, before reclaiming the page, we should + * unmap it from mmu first. + */ + WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn))); + if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) -- cgit v1.2.3 From 903816fa4d016e20ec71a1a97700cfcdda115580 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:54:11 +0800 Subject: KVM: using get_fault_pfn to get the fault pfn Using get_fault_pfn to cleanup the code Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 685a4855738..f85cc21ae95 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2513,10 +2513,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, unsigned long hva; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); - if (!slot) { - get_page(fault_page); - return page_to_pfn(fault_page); - } + if (!slot) + return get_fault_pfn(); hva = gfn_to_hva_memslot(slot, gfn); -- cgit v1.2.3 From d566104853361cc377c61f70e41c1ad3d44b86c6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:56:16 +0800 Subject: KVM: remove the unused parameter of gfn_to_pfn_memslot The parameter, 'kvm', is not used in gfn_to_pfn_memslot, we can happily remove it Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/powerpc/kvm/e500_tlb.c | 2 +- arch/x86/kvm/mmu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index c510fc96130..c8f6c582674 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -520,7 +520,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (likely(!pfnmap)) { unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); - pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); + pfn = gfn_to_pfn_memslot(slot, gfn); if (is_error_pfn(pfn)) { printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", (long)gfn); diff 
--git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f85cc21ae95..4f77f7ac6d2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2518,7 +2518,7 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, hva = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn_atomic(vcpu->kvm, hva); + return hva_to_pfn_atomic(hva); } static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 0fa060714753ef65aef294b3462efb0212520933 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:16:19 +0800 Subject: KVM: VMX: Fix typos Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c39b60707e0..2300e5319ed 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1343,7 +1343,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) guest_efer = vmx->vcpu.arch.efer; /* - * NX is emulated; LMA and LME handled by hardware; SCE meaninless + * NX is emulated; LMA and LME handled by hardware; SCE meaningless * outside long mode */ ignore_bits = EFER_NX | EFER_SCE; @@ -3261,7 +3261,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, * qemu binaries. * IA32 arch specifies that at the time of processor reset the * "Accessed" bit in the AR field of segment registers is 1. And qemu - * is setting it to 0 in the usedland code. This causes invalid guest + * is setting it to 0 in the userland code. This causes invalid guest * state vmexit when "unrestricted guest" mode is turned on. * Fix for this setup issue in cpu_reset is being pushed in the qemu * tree. Newer qemu binaries with that qemu fix would not need this @@ -4446,7 +4446,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -/* called to set cr0 as approriate for a mov-to-cr0 exit. */ +/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { if (to_vmx(vcpu)->nested.vmxon && -- cgit v1.2.3 From c5ec2e56d0a654797ee43a31001738ccd05eef0b Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:16:43 +0800 Subject: KVM: SVM: Fix typos Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index baead950d6c..687d0c30e55 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2063,7 +2063,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) if (svm->nested.intercept & 1ULL) { /* * The #vmexit can't be emulated here directly because this - * code path runs with irqs and preemtion disabled. A + * code path runs with irqs and preemption disabled. A * #vmexit emulation might sleep. Only signal request for * the #vmexit here. */ @@ -2409,7 +2409,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { /* * This function merges the msr permission bitmaps of kvm and the - * nested vmcb. It is omptimized in that it only merges the parts where + * nested vmcb. 
It is optimized in that it only merges the parts where * the kvm msr permission bitmap may contain zero bits */ int i; -- cgit v1.2.3 From 4a9699807c491740c4dfe7b6a06703e1d262e802 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:17:27 +0800 Subject: KVM: x86: Fix typos in x86.c Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ecc71dde4bb..3d9d08edbf2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1093,7 +1093,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) * For each generation, we track the original measured * nanosecond time, offset, and write, so if TSCs are in * sync, we can match exact offset, and if not, we can match - * exact software computaion in compute_guest_tsc() + * exact software computation in compute_guest_tsc() * * These values are tracked in kvm->arch.cur_xxx variables. */ @@ -1500,7 +1500,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 2:5 are resrved, Should be zero */ + /* Bits 2:5 are reserved, Should be zero */ if (data & 0x3c) return 1; @@ -1723,7 +1723,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) * Ignore all writes to this no longer documented MSR. * Writes are only relevant for old K7 processors, * all pre-dating SVM, but a recommended workaround from - * AMD for these chips. It is possible to speicify the + * AMD for these chips. It is possible to specify the * affected processor models on the command line, hence * the need to ignore the workaround. */ @@ -4491,7 +4491,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) /* * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-entetr the + * and it failed try to unshadow page and re-enter the * guest to let CPU execute the instruction. */ if (kvm_mmu_unprotect_page_virt(vcpu, gva)) @@ -5587,7 +5587,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) /* * We are here if userspace calls get_regs() in the middle of * instruction emulation. Registers state needs to be copied - * back from emulation context to vcpu. Usrapace shouldn't do + * back from emulation context to vcpu. Userspace shouldn't do * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ @@ -6116,7 +6116,7 @@ int kvm_arch_hardware_enable(void *garbage) * as we reset last_host_tsc on all VCPUs to stop this from being * called multiple times (one for each physical CPU bringup). * - * Platforms with unnreliable TSCs don't have to deal with this, they + * Platforms with unreliable TSCs don't have to deal with this, they * will be compensated by the logic in vcpu_load, which sets the TSC to * catchup mode. This will catchup all VCPUs to real time, but cannot * guarantee that they stay in perfect synchronization. @@ -6391,7 +6391,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, map_flags = MAP_SHARED | MAP_ANONYMOUS; /*To keep backward compatibility with older userspace, - *x86 needs to hanlde !user_alloc case. + *x86 needs to handle !user_alloc case. 
*/ if (!user_alloc) { if (npages && !old.rmap) { -- cgit v1.2.3 From fc0586807dc4e307da6d3ba4ed5c927b6d27276c Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:19:51 +0800 Subject: KVM: x86: Fix typos in emulate.c Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 97d9a9914ba..85b611e13e8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -642,7 +642,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) goto bad; } else { - /* exapand-down segment */ + /* expand-down segment */ if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) goto bad; lim = desc.d ? 0xffffffff : 0xffff; @@ -1383,7 +1383,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, err_code = selector & 0xfffc; err_vec = GP_VECTOR; - /* can't load system descriptor into segment selecor */ + /* can't load system descriptor into segment selector */ if (seg <= VCPU_SREG_GS && !seg_desc.s) goto exception; @@ -2398,7 +2398,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); /* - * Now load segment descriptors. If fault happenes at this stage + * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); @@ -2640,7 +2640,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, * * 1. jmp/call/int to task gate: Check against DPL of the task gate * 2. Exception/IRQ/iret: No check is performed - * 3. jmp/call to TSS: Check agains DPL of the TSS + * 3. 
jmp/call to TSS: Check against DPL of the TSS */ if (reason == TASK_SWITCH_GATE) { if (idt_index != -1) { @@ -2681,7 +2681,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; /* set back link to prev task only if NT bit is set in eflags - note that old_tss_sel is not used afetr this point */ + note that old_tss_sel is not used after this point */ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) old_tss_sel = 0xffff; -- cgit v1.2.3 From bbbda79510b6e5d399fae76bdb9b999286eb1b59 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:20:58 +0800 Subject: KVM: x86: Fix typos in cpuid.c Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0595f1397b7..b496da684bd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } case 7: { entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* Mask ebx against host capbability word 9 */ + /* Mask ebx against host capability word 9 */ if (index == 0) { entry->ebx &= kvm_supported_word9_x86_features; cpuid_mask(&entry->ebx, 9); -- cgit v1.2.3 From d5b0b5b196b474ef26f46f2036c8fbaa8cca2cf5 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:22:57 +0800 Subject: KVM: x86: Fix typos in lapic.c Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ce878788a39..fff7173f6a7 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -719,7 +719,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, { unsigned char alignment = offset & 0xf; u32 result; - /* this bitmask has a bit cleared for each reserver register */ + /* this bitmask has a bit cleared for each reserved register */ static const u64 rmask = 0x43ff01ffffffe70cULL; if ((alignment + len) > 4) { @@ -792,7 +792,7 @@ static void start_apic_timer(struct kvm_lapic *apic) atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - /* lapic timer in oneshot or peroidic mode */ + /* lapic timer in oneshot or periodic mode */ now = apic->lapic_timer.timer.base->get_time(); apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; -- cgit v1.2.3 From c7a7062fa00db7dc66280a72cd9dad0f3595bc66 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:23:08 +0800 Subject: KVM: x86: Fix typos in pmu.c Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 2e88438ffd8..db206a46e0d 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -1,5 +1,5 @@ /* - * Kernel-based Virtual Machine -- Performane Monitoring Unit support + * Kernel-based Virtual Machine -- Performance Monitoring Unit support * * Copyright 2011 Red Hat, Inc. and/or its affiliates. * -- cgit v1.2.3 From 93b6547e2219784b2df790353e083e0bdbebd2c2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 19 Jul 2012 13:55:53 +0300 Subject: KVM: switch to symbolic name for irq_states size Use PIC_NUM_PINS instead of hard-coded 16 for pic pins. 
Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2086f2bfba3..2d03568e949 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -70,7 +70,7 @@ struct kvm_pic { struct kvm_io_device dev_slave; struct kvm_io_device dev_eclr; void (*ack_notifier)(void *opaque, int irq); - unsigned long irq_states[16]; + unsigned long irq_states[PIC_NUM_PINS]; }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); -- cgit v1.2.3 From f2a743473194a1ad44a85f8b63aeef9d63e5bf47 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 18 Jul 2012 19:07:32 +0530 Subject: KVM: Add config to support ple or cpu relax optimzation Suggested-by: Avi Kivity Signed-off-by: Raghavendra K T Reviewed-by: Marcelo Tosatti Reviewed-by: Rik van Riel Tested-by: Christian Borntraeger # on s390x Signed-off-by: Avi Kivity --- arch/s390/kvm/Kconfig | 1 + arch/x86/kvm/Kconfig | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 78eb9847008..a6e2677724e 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM depends on HAVE_KVM && EXPERIMENTAL select PREEMPT_NOTIFIERS select ANON_INODES + select HAVE_KVM_CPU_RELAX_INTERCEPT ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a28f338843e..45c044f0fff 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -37,6 +37,7 @@ config KVM select TASK_DELAY_ACCT select PERF_EVENTS select HAVE_KVM_MSI + select HAVE_KVM_CPU_RELAX_INTERCEPT ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. 
You will need a fairly recent -- cgit v1.2.3 From 3b2bd2f800ba9488d9ad493216a0c07d71055b56 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 26 Jul 2012 11:57:43 +0800 Subject: KVM: MMU: use kvm_release_pfn_clean to release pfn The current code depends on the fact that fault_page is the normal page, however, we will use the error code instead of these dummy pages in the later patch, so we use kvm_release_pfn_clean to release pfn which will release the error code properly Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 24199344359..a9a20528e70 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3275,7 +3275,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!async) return false; /* *pfn has correct page already */ - put_page(pfn_to_page(*pfn)); + kvm_release_pfn_clean(*pfn); if (!prefault && can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(gva, gfn); -- cgit v1.2.3 From f23b070e662ca55a8fdaaa28537af06cab664499 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 26 Jul 2012 13:12:22 +0800 Subject: KVM: x86 emulator: simplify read_emulated No need split mmio read region into 8-bits pieces since we do it in emulator_read_write_onepage Changelog: Add a WARN_ON to check read-cache overflow Acked-by: Marcelo Tosatti Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 85b611e13e8..2c5d1e65d9d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1166,24 +1166,21 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, int rc; struct read_cache *mc = &ctxt->mem_read; - while (size) { - int n = min(size, 8u); - size -= n; - if (mc->pos < mc->end) - goto read_cached; - - rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n, - &ctxt->exception); - if (rc != X86EMUL_CONTINUE) - return rc; - mc->end += n; + if (mc->pos < mc->end) + goto read_cached; - read_cached: - memcpy(dest, mc->data + mc->pos, n); - mc->pos += n; - dest += n; - addr += n; - } + WARN_ON((mc->end + size) >= sizeof(mc->data)); + + rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size, + &ctxt->exception); + if (rc != X86EMUL_CONTINUE) + return rc; + + mc->end += size; + +read_cached: + memcpy(dest, mc->data + mc->pos, size); + mc->pos += size; return X86EMUL_CONTINUE; } -- cgit v1.2.3 From 99245b507dc3b1b2815d6a6cb4e94a6b7018a24b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 25 Jul 2012 15:49:42 +0300 Subject: KVM: x86 emulator: drop unneeded call to get_segment() setup_syscalls_segments() calls get_segment() and than overwrites all but one of the structure fields and this one should also be overwritten anyway, so we can drop call to get_segment() and avoid a couple of vmreads on vmx. Also drop zeroing ss/cs structures since most of the fields are set anyway. Just set those that were not set explicitly. 
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2c5d1e65d9d..10f0136f50c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2035,12 +2035,6 @@ static void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct desc_struct *cs, struct desc_struct *ss) { - u16 selector; - - memset(cs, 0, sizeof(struct desc_struct)); - ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); - memset(ss, 0, sizeof(struct desc_struct)); - cs->l = 0; /* will be adjusted later */ set_desc_base(cs, 0); /* flat segment */ cs->g = 1; /* 4kb granularity */ @@ -2050,6 +2044,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, cs->dpl = 0; /* will be adjusted later */ cs->p = 1; cs->d = 1; + cs->avl = 0; set_desc_base(ss, 0); /* flat segment */ set_desc_limit(ss, 0xfffff); /* 4GB limit */ @@ -2059,6 +2054,8 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, ss->d = 1; /* 32bit stack segment */ ss->dpl = 0; ss->p = 1; + ss->l = 0; + ss->avl = 0; } static bool vendor_intel(struct x86_emulate_ctxt *ctxt) -- cgit v1.2.3 From 23d43cf998275bc97437931c0cdee1df2c1aa3ca Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 24 Jul 2012 08:51:20 -0400 Subject: KVM: Move KVM_IRQ_LINE to arch-generic code Handle KVM_IRQ_LINE and KVM_IRQ_LINE_STATUS in the generic kvm_vm_ioctl() function and call into kvm_vm_ioctl_irq_line(). This is even more relevant when KVM/ARM also uses this ioctl. Signed-off-by: Christoffer Dall Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 33 ++++++++++----------------------- arch/x86/kvm/x86.c | 33 ++++++++++----------------------- 2 files changed, 20 insertions(+), 46 deletions(-) (limited to 'arch') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index bd77cb507c1..eac65380bd2 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -924,6 +924,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -963,29 +973,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; } break; - case KVM_IRQ_LINE_STATUS: - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - r = -ENXIO; - if (irqchip_in_kernel(kvm)) { - __s32 status; - status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); - if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; - irq_event.status = status; - if (copy_to_user(argp, &irq_event, - sizeof irq_event)) - goto out; - } - r = 0; - } - break; - } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip chip; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d9d08edbf2..b6379e55ee2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3165,6 +3165,16 @@ out: return r; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, 
irq_event->level); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3271,29 +3281,6 @@ long kvm_arch_vm_ioctl(struct file *filp, create_pit_unlock: mutex_unlock(&kvm->slots_lock); break; - case KVM_IRQ_LINE_STATUS: - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - r = -ENXIO; - if (irqchip_in_kernel(kvm)) { - __s32 status; - status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); - if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; - irq_event.status = status; - if (copy_to_user(argp, &irq_event, - sizeof irq_event)) - goto out; - } - r = 0; - } - break; - } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip *chip; -- cgit v1.2.3 From 9b7fb990e080f3a7c5dff9a829d11247e629b98f Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 23 Jul 2012 17:20:28 +0200 Subject: s390/dis: Instruction decoding interface Provide a new function, insn_to_mnemonic, by which e.g. kvm can obtain a human-readable decoding of an instruction's opcode. Reviewed-by: Christian Borntraeger Reviewed-by: Heiko Carstens Signed-off-by: Cornelia Huck Signed-off-by: Avi Kivity --- arch/s390/include/asm/processor.h | 1 + arch/s390/kernel/dis.c | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'arch') diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index c40fa91e38a..31feac63054 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -138,6 +138,7 @@ extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); extern unsigned long thread_saved_pc(struct task_struct *t); extern void show_code(struct pt_regs *regs); +extern int insn_to_mnemonic(unsigned char *instruction, char buf[8]); unsigned long get_wchan(struct task_struct *p); #define task_pt_regs(tsk) ((struct pt_regs *) \ diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 1f6b428e276..c02310b8db0 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -1468,6 +1468,33 @@ static struct insn *find_insn(unsigned char *code) return NULL; } +/** + * insn_to_mnemonic - decode an s390 instruction + * @instruction: instruction to decode + * @buf: buffer to fill with mnemonic + * + * Decode the instruction at @instruction and store the corresponding + * mnemonic into @buf. + * @buf is left unchanged if the instruction could not be decoded. + * Returns: + * %0 on success, %-ENOENT if the instruction was not found. 
+ */ +int insn_to_mnemonic(unsigned char *instruction, char buf[8]) +{ + struct insn *insn; + + insn = find_insn(instruction); + if (!insn) + return -ENOENT; + if (insn->name[0] == '\0') + snprintf(buf, sizeof(buf), "%s", + long_insn_name[(int) insn->name[1]]); + else + snprintf(buf, sizeof(buf), "%.5s", insn->name); + return 0; +} +EXPORT_SYMBOL_GPL(insn_to_mnemonic); + static int print_insn(char *buffer, unsigned char *code, unsigned long addr) { struct insn *insn; -- cgit v1.2.3 From 5786fffa96ae7c3f8111e29fb62e073a45efc557 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 23 Jul 2012 17:20:29 +0200 Subject: KVM: s390: Add architectural trace events Add trace events for several s390 architecture specifics: - SIE entry/exit - common intercepts - common instructions (sigp/diagnose) Reviewed-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Avi Kivity --- arch/s390/kvm/diag.c | 2 + arch/s390/kvm/intercept.c | 8 ++ arch/s390/kvm/kvm-s390.c | 7 + arch/s390/kvm/priv.c | 9 +- arch/s390/kvm/sigp.c | 2 + arch/s390/kvm/trace.h | 341 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 368 insertions(+), 1 deletion(-) create mode 100644 arch/s390/kvm/trace.h (limited to 'arch') diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index c88bb779339..f5d4416ebfe 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -14,6 +14,7 @@ #include #include #include "kvm-s390.h" +#include "trace.h" static int diag_release_pages(struct kvm_vcpu *vcpu) { @@ -105,6 +106,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) { int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16; + trace_kvm_s390_handle_diag(vcpu, code); switch (code) { case 0x10: return diag_release_pages(vcpu); diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index adae539f12e..db541d62c77 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -19,6 +19,7 @@ #include "kvm-s390.h" #include "gaccess.h" +#include "trace.h" static int handle_lctlg(struct kvm_vcpu *vcpu) { @@ -45,6 +46,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, disp2); + trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr); do { rc = get_guest_u64(vcpu, useraddr, @@ -82,6 +84,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, disp2); + trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr); reg = reg1; do { @@ -171,6 +174,7 @@ static int handle_validity(struct kvm_vcpu *vcpu) int rc; vcpu->stat.exit_validity++; + trace_kvm_s390_intercept_validity(vcpu, viwhy); if (viwhy == 0x37) { vmaddr = gmap_fault(vcpu->arch.sie_block->prefix, vcpu->arch.gmap); @@ -213,6 +217,9 @@ static int handle_instruction(struct kvm_vcpu *vcpu) intercept_handler_t handler; vcpu->stat.exit_instruction++; + trace_kvm_s390_intercept_instruction(vcpu, + vcpu->arch.sie_block->ipa, + vcpu->arch.sie_block->ipb); handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8]; if (handler) return handler(vcpu); @@ -222,6 +229,7 @@ static int handle_instruction(struct kvm_vcpu *vcpu) static int handle_prog(struct kvm_vcpu *vcpu) { vcpu->stat.exit_program_interruption++; + trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc); return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d470ccbfaba..4613602e123 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c 
@@ -32,6 +32,9 @@ #include "kvm-s390.h" #include "gaccess.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU struct kvm_stats_debugfs_item debugfs_entries[] = { @@ -607,18 +610,22 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); VCPU_EVENT(vcpu, 6, "entering sie flags %x", atomic_read(&vcpu->arch.sie_block->cpuflags)); + trace_kvm_s390_sie_enter(vcpu, + atomic_read(&vcpu->arch.sie_block->cpuflags)); rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); if (rc) { if (kvm_is_ucontrol(vcpu->kvm)) { rc = SIE_INTERCEPT_UCONTROL; } else { VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); + trace_kvm_s390_sie_fault(vcpu); kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); rc = 0; } } VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); + trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); local_irq_disable(); kvm_guest_exit(); local_irq_enable(); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 60da903d6f3..ed256fdd7b5 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -20,6 +20,7 @@ #include #include "gaccess.h" #include "kvm-s390.h" +#include "trace.h" static int handle_set_prefix(struct kvm_vcpu *vcpu) { @@ -59,6 +60,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) kvm_s390_set_prefix(vcpu, address); VCPU_EVENT(vcpu, 5, "setting prefix to %x", address); + trace_kvm_s390_handle_prefix(vcpu, 1, address); out: return 0; } @@ -91,6 +93,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) } VCPU_EVENT(vcpu, 5, "storing prefix to %x", address); + trace_kvm_s390_handle_prefix(vcpu, 0, address); out: return 0; } @@ -119,6 +122,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) } VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr); + trace_kvm_s390_handle_stap(vcpu, useraddr); out: return 0; } @@ -164,9 +168,11 @@ static int handle_stfl(struct kvm_vcpu *vcpu) &facility_list, sizeof(facility_list)); if (rc == -EFAULT) kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - else + else { VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list); + trace_kvm_s390_handle_stfl(vcpu, facility_list); + } return 0; } @@ -278,6 +284,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out_mem; } + trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); free_page(mem); vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); vcpu->run->s.regs.gprs[0] = 0; diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 56f80e1f98f..566ddf6e8df 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -18,6 +18,7 @@ #include #include "gaccess.h" #include "kvm-s390.h" +#include "trace.h" static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, u64 *reg) @@ -344,6 +345,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) else parameter = vcpu->run->s.regs.gprs[r1 + 1]; + trace_kvm_s390_handle_sigp(vcpu, order_code, cpu_addr, parameter); switch (order_code) { case SIGP_SENSE: vcpu->stat.instruction_sigp_sense++; diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h new file mode 100644 index 00000000000..2b29e62351d --- /dev/null +++ b/arch/s390/kvm/trace.h @@ -0,0 +1,341 @@ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm +#define TRACE_INCLUDE_PATH . 
+#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* + * Helpers for vcpu-specific tracepoints containing the same information + * as s390dbf VCPU_EVENTs. + */ +#define VCPU_PROTO_COMMON struct kvm_vcpu *vcpu +#define VCPU_ARGS_COMMON vcpu +#define VCPU_FIELD_COMMON __field(int, id) \ + __field(unsigned long, pswmask) \ + __field(unsigned long, pswaddr) +#define VCPU_ASSIGN_COMMON do { \ + __entry->id = vcpu->vcpu_id; \ + __entry->pswmask = vcpu->arch.sie_block->gpsw.mask; \ + __entry->pswaddr = vcpu->arch.sie_block->gpsw.addr; \ + } while (0); +#define VCPU_TP_PRINTK(p_str, p_args...) \ + TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id, \ + __entry->pswmask, __entry->pswaddr, p_args) + +/* + * Tracepoints for SIE entry and exit. + */ +TRACE_EVENT(kvm_s390_sie_enter, + TP_PROTO(VCPU_PROTO_COMMON, int cpuflags), + TP_ARGS(VCPU_ARGS_COMMON, cpuflags), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(int, cpuflags) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->cpuflags = cpuflags; + ), + + VCPU_TP_PRINTK("entering sie flags %x", __entry->cpuflags) + ); + +TRACE_EVENT(kvm_s390_sie_fault, + TP_PROTO(VCPU_PROTO_COMMON), + TP_ARGS(VCPU_ARGS_COMMON), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + ), + + VCPU_TP_PRINTK("%s", "fault in sie instruction") + ); + +#define sie_intercept_code \ + {0x04, "Instruction"}, \ + {0x08, "Program interruption"}, \ + {0x0C, "Instruction and program interuption"}, \ + {0x10, "External request"}, \ + {0x14, "External interruption"}, \ + {0x18, "I/O request"}, \ + {0x1C, "Wait state"}, \ + {0x20, "Validity"}, \ + {0x28, "Stop request"} + +TRACE_EVENT(kvm_s390_sie_exit, + TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode), + TP_ARGS(VCPU_ARGS_COMMON, icptcode), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(u8, icptcode) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->icptcode = icptcode; + ), + + VCPU_TP_PRINTK("exit sie icptcode %d (%s)", __entry->icptcode, + __print_symbolic(__entry->icptcode, + sie_intercept_code)) + ); + +/* + * Trace point for intercepted instructions. + */ +TRACE_EVENT(kvm_s390_intercept_instruction, + TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb), + TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u64, instruction) + __field(char, insn[8]) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->instruction = ((__u64)ipa << 48) | + ((__u64)ipb << 16); + ), + + VCPU_TP_PRINTK("intercepted instruction %016llx (%s)", + __entry->instruction, + insn_to_mnemonic((unsigned char *) + &__entry->instruction, + __entry->insn) ? + "unknown" : __entry->insn) + ); + +/* + * Trace point for intercepted program interruptions. + */ +TRACE_EVENT(kvm_s390_intercept_prog, + TP_PROTO(VCPU_PROTO_COMMON, __u16 code), + TP_ARGS(VCPU_ARGS_COMMON, code), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u16, code) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->code = code; + ), + + VCPU_TP_PRINTK("intercepted program interruption %04x", + __entry->code) + ); + +/* + * Trace point for validity intercepts. + */ +TRACE_EVENT(kvm_s390_intercept_validity, + TP_PROTO(VCPU_PROTO_COMMON, __u16 viwhy), + TP_ARGS(VCPU_ARGS_COMMON, viwhy), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u16, viwhy) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->viwhy = viwhy; + ), + + VCPU_TP_PRINTK("got validity intercept %04x", __entry->viwhy) + ); + +/* + * Trace points for instructions that are of special interest. 
+ */ + +#define sigp_order_codes \ + {SIGP_SENSE, "sense"}, \ + {SIGP_EXTERNAL_CALL, "external call"}, \ + {SIGP_EMERGENCY_SIGNAL, "emergency signal"}, \ + {SIGP_STOP, "stop"}, \ + {SIGP_STOP_AND_STORE_STATUS, "stop and store status"}, \ + {SIGP_SET_ARCHITECTURE, "set architecture"}, \ + {SIGP_SET_PREFIX, "set prefix"}, \ + {SIGP_SENSE_RUNNING, "sense running"}, \ + {SIGP_RESTART, "restart"} + +TRACE_EVENT(kvm_s390_handle_sigp, + TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \ + __u32 parameter), + TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr, parameter), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u8, order_code) + __field(__u16, cpu_addr) + __field(__u32, parameter) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->order_code = order_code; + __entry->cpu_addr = cpu_addr; + __entry->parameter = parameter; + ), + + VCPU_TP_PRINTK("handle sigp order %02x (%s), cpu address %04x, " \ + "parameter %08x", __entry->order_code, + __print_symbolic(__entry->order_code, + sigp_order_codes), + __entry->cpu_addr, __entry->parameter) + ); + +#define diagnose_codes \ + {0x10, "release pages"}, \ + {0x44, "time slice end"}, \ + {0x308, "ipl functions"}, \ + {0x500, "kvm hypercall"}, \ + {0x501, "kvm breakpoint"} + +TRACE_EVENT(kvm_s390_handle_diag, + TP_PROTO(VCPU_PROTO_COMMON, __u16 code), + TP_ARGS(VCPU_ARGS_COMMON, code), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u16, code) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->code = code; + ), + + VCPU_TP_PRINTK("handle diagnose call %04x (%s)", __entry->code, + __print_symbolic(__entry->code, diagnose_codes)) + ); + +TRACE_EVENT(kvm_s390_handle_lctl, + TP_PROTO(VCPU_PROTO_COMMON, int g, int reg1, int reg3, u64 addr), + TP_ARGS(VCPU_ARGS_COMMON, g, reg1, reg3, addr), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(int, g) + __field(int, reg1) + __field(int, reg3) + __field(u64, addr) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->g = g; + __entry->reg1 = reg1; + __entry->reg3 = reg3; + __entry->addr = addr; + ), + + VCPU_TP_PRINTK("%s: loading cr %x-%x from %016llx", + __entry->g ? "lctlg" : "lctl", + __entry->reg1, __entry->reg3, __entry->addr) + ); + +TRACE_EVENT(kvm_s390_handle_prefix, + TP_PROTO(VCPU_PROTO_COMMON, int set, u32 address), + TP_ARGS(VCPU_ARGS_COMMON, set, address), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(int, set) + __field(u32, address) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->set = set; + __entry->address = address; + ), + + VCPU_TP_PRINTK("%s prefix to %08x", + __entry->set ? 
"setting" : "storing", + __entry->address) + ); + +TRACE_EVENT(kvm_s390_handle_stap, + TP_PROTO(VCPU_PROTO_COMMON, u64 address), + TP_ARGS(VCPU_ARGS_COMMON, address), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(u64, address) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->address = address; + ), + + VCPU_TP_PRINTK("storing cpu address to %016llx", + __entry->address) + ); + +TRACE_EVENT(kvm_s390_handle_stfl, + TP_PROTO(VCPU_PROTO_COMMON, unsigned int facility_list), + TP_ARGS(VCPU_ARGS_COMMON, facility_list), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(unsigned int, facility_list) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->facility_list = facility_list; + ), + + VCPU_TP_PRINTK("store facility list value %08x", + __entry->facility_list) + ); + +TRACE_EVENT(kvm_s390_handle_stsi, + TP_PROTO(VCPU_PROTO_COMMON, int fc, int sel1, int sel2, u64 addr), + TP_ARGS(VCPU_ARGS_COMMON, fc, sel1, sel2, addr), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(int, fc) + __field(int, sel1) + __field(int, sel2) + __field(u64, addr) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->fc = fc; + __entry->sel1 = sel1; + __entry->sel2 = sel2; + __entry->addr = addr; + ), + + VCPU_TP_PRINTK("STSI %d.%d.%d information stored to %016llx", + __entry->fc, __entry->sel1, __entry->sel2, + __entry->addr) + ); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From ade38c311a0ad8c32e902fe1d0ae74d0d44bc71e Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 23 Jul 2012 17:20:30 +0200 Subject: KVM: s390: Add implementation-specific trace events Introduce a new trace system, kvm-s390, for some kvm/s390 specific trace points: - injection of interrupts - delivery of interrupts to the guest - creation/destruction of kvm machines and vcpus - stop actions for vcpus - reset requests for userspace Reviewed-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Avi Kivity --- arch/s390/kvm/diag.c | 2 + arch/s390/kvm/intercept.c | 3 + arch/s390/kvm/interrupt.c | 23 +++++ arch/s390/kvm/kvm-s390.c | 3 + arch/s390/kvm/trace-s390.h | 210 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 241 insertions(+) create mode 100644 arch/s390/kvm/trace-s390.h (limited to 'arch') diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index f5d4416ebfe..a390687feb1 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -15,6 +15,7 @@ #include #include "kvm-s390.h" #include "trace.h" +#include "trace-s390.h" static int diag_release_pages(struct kvm_vcpu *vcpu) { @@ -99,6 +100,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_S390_RESET; VCPU_EVENT(vcpu, 3, "requesting userspace resets %llx", vcpu->run->s390_reset_flags); + trace_kvm_s390_request_resets(vcpu->run->s390_reset_flags); return -EREMOTE; } diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index db541d62c77..22798ec33fd 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -20,6 +20,7 @@ #include "kvm-s390.h" #include "gaccess.h" #include "trace.h" +#include "trace-s390.h" static int handle_lctlg(struct kvm_vcpu *vcpu) { @@ -138,6 +139,8 @@ static int handle_stop(struct kvm_vcpu *vcpu) vcpu->stat.exit_stop_request++; spin_lock_bh(&vcpu->arch.local_int.lock); + trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits); + if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) { vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP; rc = 
SIE_INTERCEPT_RERUNVCPU; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index b7bc1aac8ed..7556231fb07 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -19,6 +19,7 @@ #include #include "kvm-s390.h" #include "gaccess.h" +#include "trace-s390.h" static int psw_extint_disabled(struct kvm_vcpu *vcpu) { @@ -130,6 +131,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, case KVM_S390_INT_EMERGENCY: VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg"); vcpu->stat.deliver_emergency_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, + inti->emerg.code, 0); rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201); if (rc == -EFAULT) exception = 1; @@ -152,6 +155,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, case KVM_S390_INT_EXTERNAL_CALL: VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call"); vcpu->stat.deliver_external_call++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, + inti->extcall.code, 0); rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202); if (rc == -EFAULT) exception = 1; @@ -175,6 +180,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", inti->ext.ext_params); vcpu->stat.deliver_service_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, + inti->ext.ext_params, 0); rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401); if (rc == -EFAULT) exception = 1; @@ -198,6 +205,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", inti->ext.ext_params, inti->ext.ext_params2); vcpu->stat.deliver_virtio_interrupt++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, + inti->ext.ext_params, + inti->ext.ext_params2); rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603); if (rc == -EFAULT) exception = 1; @@ -229,6 +239,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, case KVM_S390_SIGP_STOP: VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop"); vcpu->stat.deliver_stop_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, + 0, 0); __set_intercept_indicator(vcpu, inti); break; @@ -236,12 +248,16 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x", inti->prefix.address); vcpu->stat.deliver_prefix_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, + inti->prefix.address, 0); kvm_s390_set_prefix(vcpu, inti->prefix.address); break; case KVM_S390_RESTART: VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart"); vcpu->stat.deliver_restart_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, + 0, 0); rc = copy_to_guest(vcpu, offsetof(struct _lowcore, restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); if (rc == -EFAULT) @@ -259,6 +275,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, inti->pgm.code, table[vcpu->arch.sie_block->ipa >> 14]); vcpu->stat.deliver_program_int++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, + inti->pgm.code, 0); rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code); if (rc == -EFAULT) exception = 1; @@ -515,6 +533,7 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) inti->pgm.code = code; VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, inti->type, code, 0, 1); spin_lock_bh(&li->lock); list_add(&inti->list, &li->list); atomic_set(&li->active, 1); @@ -556,6 +575,8 @@ int 
kvm_s390_inject_vm(struct kvm *kvm, kfree(inti); return -EINVAL; } + trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64, + 2); mutex_lock(&kvm->lock); fi = &kvm->arch.float_int; @@ -621,6 +642,8 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, kfree(inti); return -EINVAL; } + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, s390int->type, s390int->parm, + s390int->parm64, 2); mutex_lock(&vcpu->kvm->lock); li = &vcpu->arch.local_int; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4613602e123..e83df7f0fed 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -34,6 +34,7 @@ #define CREATE_TRACE_POINTS #include "trace.h" +#include "trace-s390.h" #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU @@ -245,6 +246,7 @@ out_err: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, "%s", "free cpu"); + trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); if (!kvm_is_ucontrol(vcpu->kvm)) { clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn); @@ -420,6 +422,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, goto out_free_sie_block; VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu, vcpu->arch.sie_block); + trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); return vcpu; out_free_sie_block: diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h new file mode 100644 index 00000000000..90fdf85b5ff --- /dev/null +++ b/arch/s390/kvm/trace-s390.h @@ -0,0 +1,210 @@ +#if !defined(_TRACE_KVMS390_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVMS390_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm-s390 +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace-s390 + +/* + * Trace point for the creation of the kvm instance. + */ +TRACE_EVENT(kvm_s390_create_vm, + TP_PROTO(unsigned long type), + TP_ARGS(type), + + TP_STRUCT__entry( + __field(unsigned long, type) + ), + + TP_fast_assign( + __entry->type = type; + ), + + TP_printk("create vm%s", + __entry->type & KVM_VM_S390_UCONTROL ? " (UCONTROL)" : "") + ); + +/* + * Trace points for creation and destruction of vpcus. + */ +TRACE_EVENT(kvm_s390_create_vcpu, + TP_PROTO(unsigned int id, struct kvm_vcpu *vcpu, + struct kvm_s390_sie_block *sie_block), + TP_ARGS(id, vcpu, sie_block), + + TP_STRUCT__entry( + __field(unsigned int, id) + __field(struct kvm_vcpu *, vcpu) + __field(struct kvm_s390_sie_block *, sie_block) + ), + + TP_fast_assign( + __entry->id = id; + __entry->vcpu = vcpu; + __entry->sie_block = sie_block; + ), + + TP_printk("create cpu %d at %p, sie block at %p", __entry->id, + __entry->vcpu, __entry->sie_block) + ); + +TRACE_EVENT(kvm_s390_destroy_vcpu, + TP_PROTO(unsigned int id), + TP_ARGS(id), + + TP_STRUCT__entry( + __field(unsigned int, id) + ), + + TP_fast_assign( + __entry->id = id; + ), + + TP_printk("destroy cpu %d", __entry->id) + ); + +/* + * Trace points for injection of interrupts, either per machine or + * per vcpu. 
+ */ + +#define kvm_s390_int_type \ + {KVM_S390_SIGP_STOP, "sigp stop"}, \ + {KVM_S390_PROGRAM_INT, "program interrupt"}, \ + {KVM_S390_SIGP_SET_PREFIX, "sigp set prefix"}, \ + {KVM_S390_RESTART, "sigp restart"}, \ + {KVM_S390_INT_VIRTIO, "virtio interrupt"}, \ + {KVM_S390_INT_SERVICE, "sclp interrupt"}, \ + {KVM_S390_INT_EMERGENCY, "sigp emergency"}, \ + {KVM_S390_INT_EXTERNAL_CALL, "sigp ext call"} + +TRACE_EVENT(kvm_s390_inject_vm, + TP_PROTO(__u64 type, __u32 parm, __u64 parm64, int who), + TP_ARGS(type, parm, parm64, who), + + TP_STRUCT__entry( + __field(__u32, inttype) + __field(__u32, parm) + __field(__u64, parm64) + __field(int, who) + ), + + TP_fast_assign( + __entry->inttype = type & 0x00000000ffffffff; + __entry->parm = parm; + __entry->parm64 = parm64; + __entry->who = who; + ), + + TP_printk("inject%s: type:%x (%s) parm:%x parm64:%llx", + (__entry->who == 1) ? " (from kernel)" : + (__entry->who == 2) ? " (from user)" : "", + __entry->inttype, + __print_symbolic(__entry->inttype, kvm_s390_int_type), + __entry->parm, __entry->parm64) + ); + +TRACE_EVENT(kvm_s390_inject_vcpu, + TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64, \ + int who), + TP_ARGS(id, type, parm, parm64, who), + + TP_STRUCT__entry( + __field(int, id) + __field(__u32, inttype) + __field(__u32, parm) + __field(__u64, parm64) + __field(int, who) + ), + + TP_fast_assign( + __entry->id = id; + __entry->inttype = type & 0x00000000ffffffff; + __entry->parm = parm; + __entry->parm64 = parm64; + __entry->who = who; + ), + + TP_printk("inject%s (vcpu %d): type:%x (%s) parm:%x parm64:%llx", + (__entry->who == 1) ? " (from kernel)" : + (__entry->who == 2) ? " (from user)" : "", + __entry->id, __entry->inttype, + __print_symbolic(__entry->inttype, kvm_s390_int_type), + __entry->parm, __entry->parm64) + ); + +/* + * Trace point for the actual delivery of interrupts. + */ +TRACE_EVENT(kvm_s390_deliver_interrupt, + TP_PROTO(unsigned int id, __u64 type, __u32 data0, __u64 data1), + TP_ARGS(id, type, data0, data1), + + TP_STRUCT__entry( + __field(int, id) + __field(__u32, inttype) + __field(__u32, data0) + __field(__u64, data1) + ), + + TP_fast_assign( + __entry->id = id; + __entry->inttype = type & 0x00000000ffffffff; + __entry->data0 = data0; + __entry->data1 = data1; + ), + + TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \ + "data:%08x %016llx", + __entry->id, __entry->inttype, + __print_symbolic(__entry->inttype, kvm_s390_int_type), + __entry->data0, __entry->data1) + ); + +/* + * Trace point for resets that may be requested from userspace. + */ +TRACE_EVENT(kvm_s390_request_resets, + TP_PROTO(__u64 resets), + TP_ARGS(resets), + + TP_STRUCT__entry( + __field(__u64, resets) + ), + + TP_fast_assign( + __entry->resets = resets; + ), + + TP_printk("requesting userspace resets %llx", + __entry->resets) + ); + +/* + * Trace point for a vcpu's stop requests. 
+ */ +TRACE_EVENT(kvm_s390_stop_request, + TP_PROTO(unsigned int action_bits), + TP_ARGS(action_bits), + + TP_STRUCT__entry( + __field(unsigned int, action_bits) + ), + + TP_fast_assign( + __entry->action_bits = action_bits; + ), + + TP_printk("stop request, action_bits = %08x", + __entry->action_bits) + ); + + +#endif /* _TRACE_KVMS390_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 4a4541a40e1fe145c72c4b959fac524a5600d9fb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 22 Jul 2012 17:41:00 +0300 Subject: KVM: Don't update PPR on any APIC read The current code will update the PPR on almost any APIC read; however that's only required if we read the PPR. kvm_update_ppr() shows up in some profiles, albeit with a low usage (~1%). This should reduce it further (it will still be called during interrupt processing). Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fff7173f6a7..ad7fff7ad13 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -696,12 +696,14 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) val = apic_get_tmcct(apic); break; - + case APIC_PROCPRI: + apic_update_ppr(apic); + val = apic_get_reg(apic, offset); + break; case APIC_TASKPRI: report_tpr_access(apic, false); /* fall thru */ default: - apic_update_ppr(apic); val = apic_get_reg(apic, offset); break; } -- cgit v1.2.3 From e9d90d472da97e1b1560bffb89578ba082c88a69 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Jul 2012 18:01:50 +0300 Subject: KVM: Remove internal timer abstraction kvm_timer_fn(), the sole inhabitant of timer.c, is only used by lapic.c. Move it there to make it easier to hack on it. struct kvm_timer is a thin wrapper around hrtimer, and only adds obfuscation. Move near its two users (with different names) to prepare for simplification. 
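As background for the simplification in the follow-up patches, a minimal, hypothetical sketch of the container_of() idiom that makes the extra back-pointer fields unnecessary once the timer lives next to its only user; the structure and function names below are invented for the example and are not the kernel's:

/*
 * Illustration only: an hrtimer callback can recover the object that
 * embeds the timer with container_of(), so the timer itself does not
 * need to carry kvm/vcpu back-pointers.
 */
#include <linux/hrtimer.h>
#include <linux/kernel.h>

struct example_timer {
	struct hrtimer timer;		/* embedded hrtimer */
	s64 period;			/* unit: ns; 0 means one-shot */
};

struct example_device {
	int id;
	struct example_timer etimer;	/* timer embedded in its only user */
};

static enum hrtimer_restart example_timer_fn(struct hrtimer *data)
{
	/* step out from the hrtimer to the wrapper, then to the device */
	struct example_timer *et = container_of(data, struct example_timer, timer);
	struct example_device *dev = container_of(et, struct example_device, etimer);

	pr_debug("timer fired for device %d\n", dev->id);

	if (et->period) {
		hrtimer_add_expires_ns(&et->timer, et->period);
		return HRTIMER_RESTART;
	}
	return HRTIMER_NORESTART;
}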
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/i8254.c | 8 ++++---- arch/x86/kvm/i8254.h | 18 +++++++++++++++++- arch/x86/kvm/kvm_timer.h | 18 ------------------ arch/x86/kvm/lapic.c | 30 +++++++++++++++++++++++++++++- arch/x86/kvm/lapic.h | 17 ++++++++++++++++- arch/x86/kvm/timer.c | 47 ----------------------------------------------- 7 files changed, 67 insertions(+), 73 deletions(-) delete mode 100644 arch/x86/kvm/kvm_timer.h delete mode 100644 arch/x86/kvm/timer.c (limited to 'arch') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 4f579e8dcac..04d30401c5c 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o cpuid.o pmu.o + i8254.o cpuid.o pmu.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index adba28f88d1..1d8e75702d9 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -272,14 +272,14 @@ static void destroy_pit_timer(struct kvm_pit *pit) flush_kthread_work(&pit->expired); } -static bool kpit_is_periodic(struct kvm_timer *ktimer) +static bool kpit_is_periodic(struct kvm_pit_timer *ktimer) { struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, pit_timer); return ps->is_periodic; } -static struct kvm_timer_ops kpit_ops = { +static struct kvm_pit_timer_ops kpit_ops = { .is_periodic = kpit_is_periodic, }; @@ -322,7 +322,7 @@ static void pit_do_work(struct kthread_work *work) static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_pit_timer *ktimer = container_of(data, struct kvm_pit_timer, timer); struct kvm_pit *pt = ktimer->kvm->arch.vpit; if (ktimer->reinject || !atomic_read(&ktimer->pending)) { @@ -340,7 +340,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) { struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - struct kvm_timer *pt = &ps->pit_timer; + struct kvm_pit_timer *pt = &ps->pit_timer; s64 interval; if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index fdf40425ea1..3351816e8b3 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -21,10 +21,26 @@ struct kvm_kpit_channel_state { ktime_t count_load_time; }; +struct kvm_pit_timer { + struct hrtimer timer; + s64 period; /* unit: ns */ + u32 timer_mode_mask; + u64 tscdeadline; + atomic_t pending; /* accumulated triggered timers */ + bool reinject; + struct kvm_pit_timer_ops *t_ops; + struct kvm *kvm; + struct kvm_vcpu *vcpu; +}; + +struct kvm_pit_timer_ops { + bool (*is_periodic)(struct kvm_pit_timer *); +}; + struct kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; u32 flags; - struct kvm_timer pit_timer; + struct kvm_pit_timer pit_timer; bool is_periodic; u32 speaker_data_on; struct mutex lock; diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h deleted file mode 100644 index 497dbaa366d..00000000000 --- a/arch/x86/kvm/kvm_timer.h +++ /dev/null @@ -1,18 +0,0 @@ - -struct kvm_timer { - struct hrtimer timer; - s64 period; /* unit: ns */ - u32 timer_mode_mask; - u64 tscdeadline; - atomic_t pending; /* accumulated triggered timers */ - bool 
reinject; - struct kvm_timer_ops *t_ops; - struct kvm *kvm; - struct kvm_vcpu *vcpu; -}; - -struct kvm_timer_ops { - bool (*is_periodic)(struct kvm_timer *); -}; - -enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ad7fff7ad13..61ed32cd17c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1262,6 +1262,34 @@ static const struct kvm_io_device_ops apic_mmio_ops = { .write = apic_mmio_write, }; +static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) +{ + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_vcpu *vcpu = ktimer->vcpu; + wait_queue_head_t *q = &vcpu->wq; + + /* + * There is a race window between reading and incrementing, but we do + * not care about potentially losing timer events in the !reinject + * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked + * in vcpu_enter_guest. + */ + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + /* FIXME: this code should not know anything about vcpus */ + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + } + + if (waitqueue_active(q)) + wake_up_interruptible(q); + + if (ktimer->t_ops->is_periodic(ktimer)) { + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + return HRTIMER_RESTART; + } else + return HRTIMER_NORESTART; +} + int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -1285,7 +1313,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - apic->lapic_timer.timer.function = kvm_timer_fn; + apic->lapic_timer.timer.function = apic_timer_fn; apic->lapic_timer.t_ops = &lapic_timer_ops; apic->lapic_timer.kvm = vcpu->kvm; apic->lapic_timer.vcpu = vcpu; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 4af5405ae1e..d7251c92ed4 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -2,10 +2,25 @@ #define __KVM_X86_LAPIC_H #include "iodev.h" -#include "kvm_timer.h" #include +struct kvm_timer { + struct hrtimer timer; + s64 period; /* unit: ns */ + u32 timer_mode_mask; + u64 tscdeadline; + atomic_t pending; /* accumulated triggered timers */ + bool reinject; + struct kvm_timer_ops *t_ops; + struct kvm *kvm; + struct kvm_vcpu *vcpu; +}; + +struct kvm_timer_ops { + bool (*is_periodic)(struct kvm_timer *); +}; + struct kvm_lapic { unsigned long base_address; struct kvm_io_device dev; diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c deleted file mode 100644 index 6b85cc647f3..00000000000 --- a/arch/x86/kvm/timer.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * timer support - * - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - */ - -#include -#include -#include -#include -#include "kvm_timer.h" - -enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) -{ - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - struct kvm_vcpu *vcpu = ktimer->vcpu; - wait_queue_head_t *q = &vcpu->wq; - - /* - * There is a race window between reading and incrementing, but we do - * not care about potentially losing timer events in the !reinject - * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked - * in vcpu_enter_guest. 
- */ - if (ktimer->reinject || !atomic_read(&ktimer->pending)) { - atomic_inc(&ktimer->pending); - /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - } - - if (waitqueue_active(q)) - wake_up_interruptible(q); - - if (ktimer->t_ops->is_periodic(ktimer)) { - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); - return HRTIMER_RESTART; - } else - return HRTIMER_NORESTART; -} -- cgit v1.2.3 From 2a6eac9638a92b61de04bac4233d8ca665ae96af Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Jul 2012 18:01:51 +0300 Subject: KVM: Simplify kvm_timer 'reinject' is never initialized 't_ops' only serves as indirection to lapic_is_periodic; call that directly instead 'kvm' is never used 'vcpu' can be derived via container_of Remove these fields. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 18 +++++------------- arch/x86/kvm/lapic.h | 8 -------- 2 files changed, 5 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 61ed32cd17c..0cd431c85d3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1214,10 +1214,8 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu) *---------------------------------------------------------------------- */ -static bool lapic_is_periodic(struct kvm_timer *ktimer) +static bool lapic_is_periodic(struct kvm_lapic *apic) { - struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, - lapic_timer); return apic_lvtt_period(apic); } @@ -1253,10 +1251,6 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) kvm_apic_local_deliver(apic, APIC_LVT0); } -static struct kvm_timer_ops lapic_timer_ops = { - .is_periodic = lapic_is_periodic, -}; - static const struct kvm_io_device_ops apic_mmio_ops = { .read = apic_mmio_read, .write = apic_mmio_write, @@ -1265,7 +1259,8 @@ static const struct kvm_io_device_ops apic_mmio_ops = { static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) { struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - struct kvm_vcpu *vcpu = ktimer->vcpu; + struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer); + struct kvm_vcpu *vcpu = apic->vcpu; wait_queue_head_t *q = &vcpu->wq; /* @@ -1274,7 +1269,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked * in vcpu_enter_guest. 
*/ - if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + if (!atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); /* FIXME: this code should not know anything about vcpus */ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); @@ -1283,7 +1278,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) if (waitqueue_active(q)) wake_up_interruptible(q); - if (ktimer->t_ops->is_periodic(ktimer)) { + if (lapic_is_periodic(apic)) { hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); return HRTIMER_RESTART; } else @@ -1314,9 +1309,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = apic_timer_fn; - apic->lapic_timer.t_ops = &lapic_timer_ops; - apic->lapic_timer.kvm = vcpu->kvm; - apic->lapic_timer.vcpu = vcpu; apic->base_address = APIC_DEFAULT_PHYS_BASE; vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index d7251c92ed4..166766fffd9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -11,14 +11,6 @@ struct kvm_timer { u32 timer_mode_mask; u64 tscdeadline; atomic_t pending; /* accumulated triggered timers */ - bool reinject; - struct kvm_timer_ops *t_ops; - struct kvm *kvm; - struct kvm_vcpu *vcpu; -}; - -struct kvm_timer_ops { - bool (*is_periodic)(struct kvm_timer *); }; struct kvm_lapic { -- cgit v1.2.3 From 9d9d2239bdecd525ce3eb6cbfe4abb925c98208c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Jul 2012 18:01:52 +0300 Subject: KVM: Simplify kvm_pit_timer 'timer_mode_mask' is unused 'tscdeadline' is unused 't_ops' only adds needless indirection 'vcpu' is unused Remove. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 14 +------------- arch/x86/kvm/i8254.h | 8 -------- 2 files changed, 1 insertion(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 1d8e75702d9..a9e187a5b19 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -272,17 +272,6 @@ static void destroy_pit_timer(struct kvm_pit *pit) flush_kthread_work(&pit->expired); } -static bool kpit_is_periodic(struct kvm_pit_timer *ktimer) -{ - struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, - pit_timer); - return ps->is_periodic; -} - -static struct kvm_pit_timer_ops kpit_ops = { - .is_periodic = kpit_is_periodic, -}; - static void pit_do_work(struct kthread_work *work) { struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); @@ -330,7 +319,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) queue_kthread_work(&pt->worker, &pt->expired); } - if (ktimer->t_ops->is_periodic(ktimer)) { + if (pt->pit_state.is_periodic) { hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); return HRTIMER_RESTART; } else @@ -357,7 +346,6 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) ps->is_periodic = is_period; pt->timer.function = pit_timer_fn; - pt->t_ops = &kpit_ops; pt->kvm = ps->pit->kvm; atomic_set(&pt->pending, 0); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 3351816e8b3..c9bbcb889c4 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -24,17 +24,9 @@ struct kvm_kpit_channel_state { struct kvm_pit_timer { struct hrtimer timer; s64 period; /* unit: ns */ - u32 timer_mode_mask; - u64 tscdeadline; atomic_t pending; /* accumulated triggered timers */ bool reinject; - struct kvm_pit_timer_ops *t_ops; struct kvm *kvm; - struct kvm_vcpu *vcpu; -}; - -struct 
kvm_pit_timer_ops { - bool (*is_periodic)(struct kvm_pit_timer *); }; struct kvm_kpit_state { -- cgit v1.2.3 From 26ef19242f6e4d747a61b5fd8da72343838864e4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Jul 2012 18:01:53 +0300 Subject: KVM: fold kvm_pit_timer into kvm_kpit_state One structure nests inside the other, providing no value at all. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 52 +++++++++++++++++++++++++--------------------------- arch/x86/kvm/i8254.h | 14 +++++--------- arch/x86/kvm/x86.c | 2 +- 3 files changed, 31 insertions(+), 37 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a9e187a5b19..11300d2fa71 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -108,7 +108,7 @@ static s64 __kpit_elapsed(struct kvm *kvm) ktime_t remaining; struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - if (!ps->pit_timer.period) + if (!ps->period) return 0; /* @@ -120,9 +120,9 @@ static s64 __kpit_elapsed(struct kvm *kvm) * itself with the initial count and continues counting * from there. */ - remaining = hrtimer_get_remaining(&ps->pit_timer.timer); - elapsed = ps->pit_timer.period - ktime_to_ns(remaining); - elapsed = mod_64(elapsed, ps->pit_timer.period); + remaining = hrtimer_get_remaining(&ps->timer); + elapsed = ps->period - ktime_to_ns(remaining); + elapsed = mod_64(elapsed, ps->period); return elapsed; } @@ -238,12 +238,12 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) int value; spin_lock(&ps->inject_lock); - value = atomic_dec_return(&ps->pit_timer.pending); + value = atomic_dec_return(&ps->pending); if (value < 0) /* spurious acks can be generated if, for example, the * PIC is being reset. Handle it gracefully here */ - atomic_inc(&ps->pit_timer.pending); + atomic_inc(&ps->pending); else if (value > 0) /* in this case, we had multiple outstanding pit interrupts * that we needed to inject. 
Reinject @@ -261,14 +261,14 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) if (!kvm_vcpu_is_bsp(vcpu) || !pit) return; - timer = &pit->pit_state.pit_timer.timer; + timer = &pit->pit_state.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } static void destroy_pit_timer(struct kvm_pit *pit) { - hrtimer_cancel(&pit->pit_state.pit_timer.timer); + hrtimer_cancel(&pit->pit_state.timer); flush_kthread_work(&pit->expired); } @@ -311,16 +311,16 @@ static void pit_do_work(struct kthread_work *work) static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { - struct kvm_pit_timer *ktimer = container_of(data, struct kvm_pit_timer, timer); - struct kvm_pit *pt = ktimer->kvm->arch.vpit; + struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); + struct kvm_pit *pt = ps->kvm->arch.vpit; - if (ktimer->reinject || !atomic_read(&ktimer->pending)) { - atomic_inc(&ktimer->pending); + if (ps->reinject || !atomic_read(&ps->pending)) { + atomic_inc(&ps->pending); queue_kthread_work(&pt->worker, &pt->expired); } - if (pt->pit_state.is_periodic) { - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + if (ps->is_periodic) { + hrtimer_add_expires_ns(&ps->timer, ps->period); return HRTIMER_RESTART; } else return HRTIMER_NORESTART; @@ -329,7 +329,6 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) { struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - struct kvm_pit_timer *pt = &ps->pit_timer; s64 interval; if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) @@ -340,18 +339,18 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) pr_debug("create pit timer, interval is %llu nsec\n", interval); /* TODO The new value only affected after the retriggered */ - hrtimer_cancel(&pt->timer); + hrtimer_cancel(&ps->timer); flush_kthread_work(&ps->pit->expired); - pt->period = interval; + ps->period = interval; ps->is_periodic = is_period; - pt->timer.function = pit_timer_fn; - pt->kvm = ps->pit->kvm; + ps->timer.function = pit_timer_fn; + ps->kvm = ps->pit->kvm; - atomic_set(&pt->pending, 0); + atomic_set(&ps->pending, 0); ps->irq_ack = 1; - hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), + hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval), HRTIMER_MODE_ABS); } @@ -627,7 +626,7 @@ void kvm_pit_reset(struct kvm_pit *pit) } mutex_unlock(&pit->pit_state.lock); - atomic_set(&pit->pit_state.pit_timer.pending, 0); + atomic_set(&pit->pit_state.pending, 0); pit->pit_state.irq_ack = 1; } @@ -636,7 +635,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); if (!mask) { - atomic_set(&pit->pit_state.pit_timer.pending, 0); + atomic_set(&pit->pit_state.pending, 0); pit->pit_state.irq_ack = 1; } } @@ -694,12 +693,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit_state = &pit->pit_state; pit_state->pit = pit; - hrtimer_init(&pit_state->pit_timer.timer, - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); - pit_state->pit_timer.reinject = true; + pit_state->reinject = true; mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); @@ -749,7 +747,7 @@ void kvm_free_pit(struct kvm *kvm) 
kvm_unregister_irq_ack_notifier(kvm, &kvm->arch.vpit->pit_state.irq_ack_notifier); mutex_lock(&kvm->arch.vpit->pit_state.lock); - timer = &kvm->arch.vpit->pit_state.pit_timer.timer; + timer = &kvm->arch.vpit->pit_state.timer; hrtimer_cancel(timer); flush_kthread_work(&kvm->arch.vpit->expired); kthread_stop(kvm->arch.vpit->worker_task); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index c9bbcb889c4..dd1b16b611b 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -21,19 +21,15 @@ struct kvm_kpit_channel_state { ktime_t count_load_time; }; -struct kvm_pit_timer { - struct hrtimer timer; - s64 period; /* unit: ns */ - atomic_t pending; /* accumulated triggered timers */ - bool reinject; - struct kvm *kvm; -}; - struct kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; u32 flags; - struct kvm_pit_timer pit_timer; bool is_periodic; + s64 period; /* unit: ns */ + struct hrtimer timer; + atomic_t pending; /* accumulated triggered timers */ + bool reinject; + struct kvm *kvm; u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b6379e55ee2..3a53bcc24f2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3082,7 +3082,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, if (!kvm->arch.vpit) return -ENXIO; mutex_lock(&kvm->arch.vpit->pit_state.lock); - kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + kvm->arch.vpit->pit_state.reinject = control->pit_reinject; mutex_unlock(&kvm->arch.vpit->pit_state.lock); return 0; } -- cgit v1.2.3 From e115676e042f4d9268c6b6d8cb7dc962aa6cfd7d Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 1 Aug 2012 17:01:42 +0300 Subject: KVM: x86: update KVM_SAVE_MSRS_BEGIN to correct value When MSR_KVM_PV_EOI_EN was added to msrs_to_save array KVM_SAVE_MSRS_BEGIN was not updated accordingly. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a53bcc24f2..a87c82aa319 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -806,7 +806,7 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 9 +#define KVM_SAVE_MSRS_BEGIN 10 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, -- cgit v1.2.3 From aab2eb7a38e0e510874acca01838f5badbca6c7e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 1 Aug 2012 18:01:10 +0900 Subject: KVM: Stop checking rmap to see if slot is being created Instead, check npages consistently. This helps to make rmap architecture specific in a later patch. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3ca90d74711..abc039d7842 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6385,7 +6385,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, *x86 needs to handle !user_alloc case. 
*/ if (!user_alloc) { - if (npages && !old.rmap) { + if (npages && !old.npages) { unsigned long userspace_addr; userspace_addr = vm_mmap(NULL, 0, @@ -6413,7 +6413,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; - if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + if (!user_alloc && !old.user_alloc && old.npages && !npages) { int ret; ret = vm_munmap(old.userspace_addr, -- cgit v1.2.3 From 65fbe37c42ed75604c9a770639209dcee162ebe7 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 1 Aug 2012 18:02:01 +0900 Subject: KVM: MMU: Use gfn_to_rmap() instead of directly reading rmap array This helps to make rmap architecture specific in a later patch. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 ++- arch/x86/kvm/mmu_audit.c | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a9a20528e70..ee768bb2367 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1181,7 +1181,8 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, unsigned long *rmapp; while (mask) { - rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; + rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); /* clear the first set bit */ diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 7d7d0b9e23e..ca403f9bb0f 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -190,7 +190,6 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { - struct kvm_memory_slot *slot; unsigned long *rmapp; u64 *sptep; struct rmap_iterator iter; @@ -198,8 +197,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->role.direct || sp->unsync || sp->role.invalid) return; - slot = gfn_to_memslot(kvm, sp->gfn); - rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; + rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL); for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { -- cgit v1.2.3 From d89cc617b954aff4030fce178f7d86f59aaf713d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 1 Aug 2012 18:03:28 +0900 Subject: KVM: Push rmap into kvm_arch_memory_slot Two reasons: - x86 can integrate rmap and rmap_pde and remove heuristics in __gfn_to_rmap(). - Some architectures do not need rmap. Since rmap is one of the most memory consuming stuff in KVM, ppc'd better restrict the allocation to Book3S HV. 
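To see why the 4K-level heuristic in __gfn_to_rmap() can go once every page-size level has its own array: with per-level indexing, the base level degenerates to gfn - base_gfn, so a single lookup path covers all levels. A hypothetical sketch, with macro and helper names invented for the example (the kernel's real helpers may differ):

/* Illustration only: one rmap array per level needs no 4K special case. */
#define EX_HPAGE_GFN_SHIFT(level)	(((level) - 1) * 9)	/* 0 for 4K, 9 for 2M, 18 for 1G */

static inline unsigned long ex_gfn_to_index(unsigned long gfn,
					    unsigned long base_gfn, int level)
{
	/* at the 4K level both shifts are zero, i.e. simply gfn - base_gfn */
	return (gfn >> EX_HPAGE_GFN_SHIFT(level)) -
	       (base_gfn >> EX_HPAGE_GFN_SHIFT(level));
}

static inline unsigned long *ex_gfn_to_rmap(unsigned long *rmap[],
					    unsigned long gfn,
					    unsigned long base_gfn, int level)
{
	/* same lookup path for every level, levels numbered from 1 */
	return &rmap[level - 1][ex_gfn_to_index(gfn, base_gfn, level)];
}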
Signed-off-by: Takuya Yoshikawa Acked-by: Paul Mackerras Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 6 ++-- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 4 +-- arch/powerpc/kvm/powerpc.c | 8 ++++++ arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 5 +--- arch/x86/kvm/x86.c | 55 +++++++++++++++++++++---------------- 7 files changed, 48 insertions(+), 33 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 572ad014126..a29e0918172 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -221,6 +221,7 @@ struct revmap_entry { #define KVMPPC_GOT_PAGE 0x80 struct kvm_arch_memory_slot { + unsigned long *rmap; }; struct kvm_arch { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 3c635c0616b..d95d11322a1 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -705,7 +705,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_unlock; hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; - rmap = &memslot->rmap[gfn - memslot->base_gfn]; + rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; lock_rmap(rmap); /* Check if we might have been invalidated; let the guest retry if so */ @@ -788,7 +788,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, for (; gfn < gfn_end; ++gfn) { gfn_t gfn_offset = gfn - memslot->base_gfn; - ret = handler(kvm, &memslot->rmap[gfn_offset], gfn); + ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn); retval |= ret; } } @@ -1036,7 +1036,7 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) unsigned long *rmapp, *map; preempt_disable(); - rmapp = memslot->rmap; + rmapp = memslot->arch.rmap; map = memslot->dirty_bitmap; for (i = 0; i < memslot->npages; ++i) { if (kvm_test_clear_dirty(kvm, rmapp)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 5c70d19494f..56ac1a5d991 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -84,7 +84,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) return; - rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]); + rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); lock_rmap(rmap); head = *rmap & KVMPPC_RMAP_INDEX; @@ -180,7 +180,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, if (!slot_is_aligned(memslot, psize)) return H_PARAMETER; slot_fn = gfn - memslot->base_gfn; - rmap = &memslot->rmap[slot_fn]; + rmap = &memslot->arch.rmap[slot_fn]; if (!kvm->arch.using_mmu_notifiers) { physp = kvm->arch.slot_phys[memslot->id]; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 87f4dc88607..879b14a6140 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -302,10 +302,18 @@ long kvm_arch_dev_ioctl(struct file *filp, void kvm_arch_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { + if (!dont || free->arch.rmap != dont->arch.rmap) { + vfree(free->arch.rmap); + free->arch.rmap = NULL; + } } int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) { + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); + if (!slot->arch.rmap) + return -ENOMEM; + return 0; } diff --git a/arch/x86/include/asm/kvm_host.h 
b/arch/x86/include/asm/kvm_host.h index 48e71318846..1309e69b57f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -504,7 +504,7 @@ struct kvm_lpage_info { }; struct kvm_arch_memory_slot { - unsigned long *rmap_pde[KVM_NR_PAGE_SIZES - 1]; + unsigned long *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ee768bb2367..aa9a987ddef 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -970,11 +970,8 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, { unsigned long idx; - if (likely(level == PT_PAGE_TABLE_LEVEL)) - return &slot->rmap[gfn - slot->base_gfn]; - idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.rmap_pde[level - PT_DIRECTORY_LEVEL][idx]; + return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index abc039d7842..ebf2109318e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6303,14 +6303,18 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, { int i; - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - if (!dont || free->arch.rmap_pde[i] != dont->arch.rmap_pde[i]) { - kvm_kvfree(free->arch.rmap_pde[i]); - free->arch.rmap_pde[i] = NULL; + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { + kvm_kvfree(free->arch.rmap[i]); + free->arch.rmap[i] = NULL; } - if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { - kvm_kvfree(free->arch.lpage_info[i]); - free->arch.lpage_info[i] = NULL; + if (i == 0) + continue; + + if (!dont || free->arch.lpage_info[i - 1] != + dont->arch.lpage_info[i - 1]) { + kvm_kvfree(free->arch.lpage_info[i - 1]); + free->arch.lpage_info[i - 1] = NULL; } } } @@ -6319,28 +6323,30 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) { int i; - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { unsigned long ugfn; int lpages; - int level = i + 2; + int level = i + 1; lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; - slot->arch.rmap_pde[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap_pde[i])); - if (!slot->arch.rmap_pde[i]) + slot->arch.rmap[i] = + kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); + if (!slot->arch.rmap[i]) goto out_free; + if (i == 0) + continue; - slot->arch.lpage_info[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); - if (!slot->arch.lpage_info[i]) + slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages * + sizeof(*slot->arch.lpage_info[i - 1])); + if (!slot->arch.lpage_info[i - 1]) goto out_free; if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i][0].write_count = 1; + slot->arch.lpage_info[i - 1][0].write_count = 1; if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i][lpages - 1].write_count = 1; + slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1; ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each @@ -6352,18 +6358,21 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) unsigned long j; for (j = 0; j < lpages; ++j) - slot->arch.lpage_info[i][j].write_count = 1; + slot->arch.lpage_info[i - 1][j].write_count = 1; } } return 0; out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - kvm_kvfree(slot->arch.rmap_pde[i]); - kvm_kvfree(slot->arch.lpage_info[i]); - 
slot->arch.rmap_pde[i] = NULL; - slot->arch.lpage_info[i] = NULL; + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + kvm_kvfree(slot->arch.rmap[i]); + slot->arch.rmap[i] = NULL; + if (i == 0) + continue; + + kvm_kvfree(slot->arch.lpage_info[i - 1]); + slot->arch.lpage_info[i - 1] = NULL; } return -ENOMEM; } -- cgit v1.2.3 From 6c8ee57be9350c5c2cafdd6a99d0462d528676e2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:37:54 +0800 Subject: KVM: introduce KVM_PFN_ERR_FAULT After that, the exported and un-inline function, get_fault_pfn, can be removed Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa9a987ddef..9cf90c8d584 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2512,7 +2512,7 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) - return get_fault_pfn(); + return KVM_PFN_ERR_FAULT; hva = gfn_to_hva_memslot(slot, gfn); -- cgit v1.2.3 From e6c1502b3f933ace20c711ce34ab696f5a67086d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:38:36 +0800 Subject: KVM: introduce KVM_PFN_ERR_HWPOISON Then, get_hwpoison_pfn and is_hwpoison_pfn can be removed Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9cf90c8d584..d3cdf69da51 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2649,7 +2649,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) { kvm_release_pfn_clean(pfn); - if (is_hwpoison_pfn(pfn)) { + if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); return 0; } -- cgit v1.2.3 From cb9aaa30b133574b646d9d4766ef08a843211393 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:42:10 +0800 Subject: KVM: do not release the error pfn After commit a2766325cf9f9, the error pfn is replaced by the error code, it need not be released anymore [ The patch has been compiling tested for powerpc ] Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_tlb.c | 1 - arch/x86/kvm/mmu.c | 7 +++---- arch/x86/kvm/mmu_audit.c | 4 +--- arch/x86/kvm/paging_tmpl.h | 8 ++------ 4 files changed, 6 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index c8f6c582674..09ce5ac128f 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -524,7 +524,6 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (is_error_pfn(pfn)) { printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", (long)gfn); - kvm_release_pfn_clean(pfn); return; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d3cdf69da51..9651c2cd000 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2496,7 +2496,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, rmap_recycle(vcpu, sptep, gfn); } } - kvm_release_pfn_clean(pfn); + + if (!is_error_pfn(pfn)) + kvm_release_pfn_clean(pfn); } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) @@ -2648,7 +2650,6 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * static int 
kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) { - kvm_release_pfn_clean(pfn); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); return 0; @@ -3273,8 +3274,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!async) return false; /* *pfn has correct page already */ - kvm_release_pfn_clean(*pfn); - if (!prefault && can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(gva, gfn); if (kvm_find_async_pf_gfn(vcpu, gfn)) { diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index ca403f9bb0f..daff69e2115 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -116,10 +116,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); + if (is_error_pfn(pfn)) return; - } hpa = pfn << PAGE_SHIFT; if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bb7cf01cae7..bf8c42bf50f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -370,10 +370,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); - if (mmu_invalid_pfn(pfn)) { - kvm_release_pfn_clean(pfn); + if (mmu_invalid_pfn(pfn)) return; - } /* * we call mmu_set_spte() with host_writable = true because that @@ -448,10 +446,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, pte_access & ACC_WRITE_MASK); - if (mmu_invalid_pfn(pfn)) { - kvm_release_pfn_clean(pfn); + if (mmu_invalid_pfn(pfn)) break; - } mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, NULL, PT_PAGE_TABLE_LEVEL, gfn, -- cgit v1.2.3 From 32cad84f44d186654492f1a50a1424c8906ccbd9 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:42:52 +0800 Subject: KVM: do not release the error page After commit a2766325cf9f9, the error page is replaced by the error code, it need not be released anymore [ The patch has been compiling tested for powerpc ] Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/powerpc/kvm/44x_tlb.c | 1 - arch/powerpc/kvm/book3s_pr.c | 4 +--- arch/x86/kvm/svm.c | 1 - arch/x86/kvm/vmx.c | 5 ++--- arch/x86/kvm/x86.c | 9 +++------ 5 files changed, 6 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 33aa715dab2..5dd3ab46997 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -319,7 +319,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, if (is_error_page(new_page)) { printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n", (unsigned long long)gfn); - kvm_release_page_clean(new_page); return; } hpaddr = page_to_phys(new_page); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index a1baec340f7..05c28f59f77 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -242,10 +242,8 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) int i; hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); - if (is_error_page(hpage)) { - kvm_release_page_clean(hpage); + if 
(is_error_page(hpage)) return; - } hpage_offset = pte->raddr & ~PAGE_MASK; hpage_offset &= ~0xFFFULL; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 687d0c30e55..31be4a55744 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2105,7 +2105,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) return kmap(page); error: - kvm_release_page_clean(page); kvm_inject_gp(&svm->vcpu, 0); return NULL; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d6e4cbc42b8..cc8ad983692 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -596,10 +596,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) { struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); + if (is_error_page(page)) return NULL; - } + return page; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ebf2109318e..7953a9e7cb1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1639,10 +1639,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); - if (is_error_page(vcpu->arch.time_page)) { - kvm_release_page_clean(vcpu->arch.time_page); + if (is_error_page(vcpu->arch.time_page)) vcpu->arch.time_page = NULL; - } + break; } case MSR_KVM_ASYNC_PF_EN: @@ -3945,10 +3944,8 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, goto emul_write; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); + if (is_error_page(page)) goto emul_write; - } kaddr = kmap_atomic(page); kaddr += offset_in_page(gpa); -- cgit v1.2.3 From 8a5a87d9b7aef24bcdba763d8ee14982477b0a2e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:26 +0300 Subject: KVM: clean up kvm_(set|get)_apic_base kvm_get_apic_base() needlessly checks irqchip_in_kernel although it does the same no matter what result of the check is. kvm_set_apic_base() also checks for irqchip_in_kernel, but kvm_lapic_set_base() can handle this case. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7953a9e7cb1..3cafbb12ae0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -246,20 +246,14 @@ static void drop_user_return_notifiers(void *ignore) u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { - if (irqchip_in_kernel(vcpu->kvm)) - return vcpu->arch.apic_base; - else - return vcpu->arch.apic_base; + return vcpu->arch.apic_base; } EXPORT_SYMBOL_GPL(kvm_get_apic_base); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) { /* TODO: reserve bits check */ - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_base(vcpu, data); - else - vcpu->arch.apic_base = data; + kvm_lapic_set_base(vcpu, data); } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -- cgit v1.2.3 From 5dbc8f3fed0b4cb04dfb276150294f21c5f0fc66 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:27 +0300 Subject: KVM: use kvm_lapic_set_base() to change apic_base Do not change apic_base directly. Use kvm_lapic_set_base() instead. 
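To make the intent concrete: once every update funnels through kvm_lapic_set_base(), any bookkeeping keyed on the APIC enable bit only has to live in one place, which is exactly what the jump-label patches later in this series rely on. The fragment below is an illustrative, simplified sketch added for this write-up (the example_* name is made up; it is not the kernel's actual implementation):

static void example_set_apic_base(struct kvm_vcpu *vcpu, u64 value)
{
	u64 old = vcpu->arch.apic_base;

	if ((old ^ value) & MSR_IA32_APICBASE_ENABLE) {
		/* the enable bit flipped: update any derived state here */
	}

	vcpu->arch.apic_base = value;
	vcpu->arch.apic->base_address = value & MSR_IA32_APICBASE_BASE;
}

Callers such as kvm_lapic_reset() then express "set the BSP bit" as a read-modify-write through the setter, as the hunk below does, instead of poking vcpu->arch.apic_base directly.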
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0cd431c85d3..49f4ac047f6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1185,7 +1185,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); if (kvm_vcpu_is_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + kvm_lapic_set_base(vcpu, + vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); @@ -1310,8 +1311,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = apic_timer_fn; - apic->base_address = APIC_DEFAULT_PHYS_BASE; - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; + kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE); kvm_lapic_reset(vcpu); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -1380,8 +1380,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - apic->base_address = vcpu->arch.apic_base & - MSR_IA32_APICBASE_BASE; + kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); kvm_apic_set_version(vcpu); apic_update_ppr(apic); -- cgit v1.2.3 From 6aed64a8a440360be875f31eabeacaddb83ef25f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:28 +0300 Subject: KVM: mark apic enabled on start up According to SDM apic is enabled on start up. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 49f4ac047f6..c3f14fe51e9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1311,7 +1311,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = apic_timer_fn; - kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE); + kvm_lapic_set_base(vcpu, + APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); kvm_lapic_reset(vcpu); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); -- cgit v1.2.3 From c5cc421ba3219b90f11d151bc55f1608c12830fa Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:30 +0300 Subject: KVM: use jump label to optimize checking for HW enabled APIC in APIC_BASE MSR Usually all APICs are HW enabled so the check can be optimized out. 
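The mechanism, annotated (editorial sketch, not part of the commit): a deferred static key counts how many APICs are in the rare, HW-disabled state; while that count is zero the enabled check folds to a constant and the branch is patched out. The example_* identifiers below are placeholders for the apic_hw_disabled / apic_hw_enabled pair introduced by this patch:

#include <linux/jump_label.h>

struct static_key_deferred example_hw_disabled __read_mostly;

/* Fast path: effectively a patched nop while no vcpu has its APIC HW-disabled. */
static inline int example_hw_enabled(struct kvm_lapic *apic)
{
	if (static_key_false(&example_hw_disabled.key))
		return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
	return MSR_IA32_APICBASE_ENABLE;
}

/* Slow path: only touch the key when the enable bit actually changes. */
static void example_track_hw_enable(u64 old_val, u64 new_val)
{
	if (!((old_val ^ new_val) & MSR_IA32_APICBASE_ENABLE))
		return;
	if (new_val & MSR_IA32_APICBASE_ENABLE)
		static_key_slow_dec_deferred(&example_hw_disabled);
	else
		static_key_slow_inc(&example_hw_disabled.key);
}

The deferred decrement, combined with the jump_label_rate_limit() call added to kvm_lapic_init(), keeps a guest that toggles the bit rapidly from forcing a code-patching storm.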
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 29 ++++++++++++++++++++++++++++- arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/x86.c | 1 + 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c3f14fe51e9..5b46cab044a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" @@ -117,9 +118,13 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap) return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +struct static_key_deferred apic_hw_disabled __read_mostly; + static inline int apic_hw_enabled(struct kvm_lapic *apic) { - return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; + if (static_key_false(&apic_hw_disabled.key)) + return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; + return MSR_IA32_APICBASE_ENABLE; } static inline int apic_sw_enabled(struct kvm_lapic *apic) @@ -1055,6 +1060,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); + if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) + static_key_slow_dec_deferred(&apic_hw_disabled); + if (vcpu->arch.apic->regs) free_page((unsigned long)vcpu->arch.apic->regs); @@ -1125,6 +1133,14 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) return; } + /* update jump label if enable bit changes */ + if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) { + if (value & MSR_IA32_APICBASE_ENABLE) + static_key_slow_dec_deferred(&apic_hw_disabled); + else + static_key_slow_inc(&apic_hw_disabled.key); + } + if (!kvm_vcpu_is_bsp(apic->vcpu)) value &= ~MSR_IA32_APICBASE_BSP; @@ -1311,6 +1327,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = apic_timer_fn; + /* + * APIC is created enabled. This will prevent kvm_lapic_set_base from + * thinking that APIC satet has changed. + */ + vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); @@ -1598,3 +1619,9 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, addr); } + +void kvm_lapic_init(void) +{ + /* do not patch jump label more than once per second */ + jump_label_rate_limit(&apic_hw_disabled, HZ); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 166766fffd9..73fa299b68e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -78,4 +78,5 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) } int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); +void kvm_lapic_init(void); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3cafbb12ae0..29fa18d27e6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4903,6 +4903,7 @@ int kvm_arch_init(void *opaque) if (cpu_has_xsave) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + kvm_lapic_init(); return 0; out: -- cgit v1.2.3 From f8c1ea103947038b7197bdd4c8451886a58af0c0 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:31 +0300 Subject: KVM: use jump label to optimize checking for SW enabled apic in spurious interrupt register Usually all APICs are SW enabled so the check can be optimized out. 
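One detail worth annotating (editorial note, not from the commit): because the key counts SW-disabled APICs, every lifecycle path has to keep that count balanced, which is why the diff below also touches kvm_create_lapic() and kvm_free_lapic(). Schematically, with hypothetical example_* wrappers around what the patch does inline:

static void example_account_lapic_create(void)
{
	/* a freshly created APIC is SW-disabled until the guest writes SPIV */
	static_key_slow_inc(&apic_sw_disabled.key);
}

static void example_account_lapic_free(struct kvm_lapic *apic)
{
	/* drop the count only if the APIC is still SW-disabled at teardown */
	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED))
		static_key_slow_dec_deferred(&apic_sw_disabled);
}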
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5b46cab044a..7f77e96ac5e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -127,9 +127,24 @@ static inline int apic_hw_enabled(struct kvm_lapic *apic) return MSR_IA32_APICBASE_ENABLE; } -static inline int apic_sw_enabled(struct kvm_lapic *apic) +struct static_key_deferred apic_sw_disabled __read_mostly; + +static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) +{ + if ((apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) { + if (val & APIC_SPIV_APIC_ENABLED) + static_key_slow_dec_deferred(&apic_sw_disabled); + else + static_key_slow_inc(&apic_sw_disabled.key); + } + apic_set_reg(apic, APIC_SPIV, val); +} + +static inline int apic_sw_enabled(struct kvm_lapic *apic) { - return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; + if (static_key_false(&apic_sw_disabled.key)) + return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; + return APIC_SPIV_APIC_ENABLED; } static inline int apic_enabled(struct kvm_lapic *apic) @@ -918,7 +933,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) u32 mask = 0x3ff; if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) mask |= APIC_SPIV_DIRECTED_EOI; - apic_set_reg(apic, APIC_SPIV, val & mask); + apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; u32 lvt_val; @@ -1055,18 +1070,23 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); void kvm_free_lapic(struct kvm_vcpu *vcpu) { + struct kvm_lapic *apic = vcpu->arch.apic; + if (!vcpu->arch.apic) return; - hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); + hrtimer_cancel(&apic->lapic_timer.timer); if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) static_key_slow_dec_deferred(&apic_hw_disabled); - if (vcpu->arch.apic->regs) - free_page((unsigned long)vcpu->arch.apic->regs); + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED)) + static_key_slow_dec_deferred(&apic_sw_disabled); - kfree(vcpu->arch.apic); + if (apic->regs) + free_page((unsigned long)apic->regs); + + kfree(apic); } /* @@ -1182,7 +1202,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_set_reg(apic, APIC_DFR, 0xffffffffU); - apic_set_reg(apic, APIC_SPIV, 0xff); + apic_set_spiv(apic, 0xff); apic_set_reg(apic, APIC_TASKPRI, 0); apic_set_reg(apic, APIC_LDR, 0); apic_set_reg(apic, APIC_ESR, 0); @@ -1335,6 +1355,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); + static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_lapic_reset(vcpu); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -1404,6 +1425,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); kvm_apic_set_version(vcpu); + apic_set_spiv(apic, apic_get_reg(apic, APIC_SPIV)); apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); @@ -1624,4 +1646,5 @@ void kvm_lapic_init(void) { /* do not patch jump label more than once per second */ jump_label_rate_limit(&apic_hw_disabled, HZ); + jump_label_rate_limit(&apic_sw_disabled, HZ); } -- cgit v1.2.3 From 54e9818f3903902a4ea3046035739b8770880092 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:32 +0300 Subject: KVM: use jump label to optimize checking for in kernel 
local apic presence Usually all vcpus have local apic pointer initialized, so the check may be completely skipped. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 62 ++++++++++++++++++++++++++++------------------------ arch/x86/kvm/x86.c | 7 +++++- arch/x86/kvm/x86.h | 1 + 3 files changed, 41 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7f77e96ac5e..650379ba73a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -152,6 +152,13 @@ static inline int apic_enabled(struct kvm_lapic *apic) return apic_sw_enabled(apic) && apic_hw_enabled(apic); } +static inline bool vcpu_has_lapic(struct kvm_vcpu *vcpu) +{ + if (static_key_false(&kvm_no_apic_vcpu)) + return vcpu->arch.apic; + return true; +} + #define LVT_MASK \ (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) @@ -204,7 +211,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *feat; u32 v = APIC_VERSION; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!vcpu_has_lapic(vcpu)) return; feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); @@ -305,7 +312,6 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; /* This may race with setting of irr in __apic_accept_irq() and @@ -313,9 +319,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) * will cause vmexit immediately and the value will be recalculated * on the next vmentry. */ - if (!apic) + if (!vcpu_has_lapic(vcpu)) return 0; - highest_irr = apic_find_highest_irr(apic); + highest_irr = apic_find_highest_irr(vcpu->arch.apic); return highest_irr; } @@ -1061,9 +1067,7 @@ static int apic_mmio_write(struct kvm_io_device *this, void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - - if (apic) + if (vcpu_has_lapic(vcpu)) apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); @@ -1098,10 +1102,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - return 0; - if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + if (!vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + apic_lvtt_period(apic)) return 0; return apic->lapic_timer.tscdeadline; @@ -1110,10 +1113,9 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - return; - if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + if (!vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + apic_lvtt_period(apic)) return; hrtimer_cancel(&apic->lapic_timer.timer); @@ -1125,20 +1127,21 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) + if (!vcpu_has_lapic(vcpu)) return; + apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | (apic_get_reg(apic, APIC_TASKPRI) & 4)); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; u64 tpr; - if (!apic) + if (!vcpu_has_lapic(vcpu)) return 0; - tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); + + tpr = (u64) apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; } @@ -1237,7 +1240,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) bool kvm_apic_present(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic && 
apic_hw_enabled(vcpu->arch.apic); + return vcpu_has_lapic(vcpu) && apic_hw_enabled(vcpu->arch.apic); } int kvm_lapic_enabled(struct kvm_vcpu *vcpu) @@ -1258,10 +1261,11 @@ static bool lapic_is_periodic(struct kvm_lapic *apic) int apic_has_pending_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *lapic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->arch.apic; - if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) - return atomic_read(&lapic->lapic_timer.pending); + if (vcpu_has_lapic(vcpu) && apic_enabled(apic) && + apic_lvt_enabled(apic, APIC_LVTT)) + return atomic_read(&apic->lapic_timer.pending); return 0; } @@ -1371,7 +1375,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; - if (!apic || !apic_enabled(apic)) + if (!vcpu_has_lapic(vcpu) || !apic_enabled(apic)) return -1; apic_update_ppr(apic); @@ -1399,7 +1403,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { + if (!vcpu_has_lapic(vcpu)) + return; + + if (atomic_read(&apic->lapic_timer.pending) > 0) { if (kvm_apic_local_deliver(apic, APIC_LVTT)) atomic_dec(&apic->lapic_timer.pending); } @@ -1439,13 +1446,12 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; struct hrtimer *timer; - if (!apic) + if (!vcpu_has_lapic(vcpu)) return; - timer = &apic->lapic_timer.timer; + timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } @@ -1602,7 +1608,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!vcpu_has_lapic(vcpu)) return 1; /* if this is ICR write vector before command */ @@ -1616,7 +1622,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 low, high = 0; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!vcpu_has_lapic(vcpu)) return 1; if (apic_reg_read(apic, reg, 4, &low)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 29fa18d27e6..8ebf65c349e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6152,6 +6152,8 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); } +struct static_key kvm_no_apic_vcpu __read_mostly; + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; @@ -6184,7 +6186,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; - } + } else + static_key_slow_inc(&kvm_no_apic_vcpu); vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, GFP_KERNEL); @@ -6224,6 +6227,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); + if (!irqchip_in_kernel(vcpu->kvm)) + static_key_slow_dec(&kvm_no_apic_vcpu); } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 3d1134ddb88..2b5219c12ac 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -124,4 +124,5 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, extern u64 host_xcr0; +extern struct static_key kvm_no_apic_vcpu; #endif -- cgit v1.2.3 From c48f14966cc41957d88c66dfe49a439e708ab7b8 Mon Sep 17 
00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:33 +0300 Subject: KVM: inline kvm_apic_present() and kvm_lapic_enabled() Those functions are used during interrupt injection. When inlined they become nops on the fast path. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 143 +++++++++++++++++++-------------------------------- arch/x86/kvm/lapic.h | 45 +++++++++++++++- 2 files changed, 96 insertions(+), 92 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 650379ba73a..333c27fa6e9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -73,11 +73,6 @@ static unsigned int min_timer_period_us = 500; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); -static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) -{ - return *((u32 *) (apic->regs + reg_off)); -} - static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { *((u32 *) (apic->regs + reg_off)) = val; @@ -119,19 +114,11 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap) } struct static_key_deferred apic_hw_disabled __read_mostly; - -static inline int apic_hw_enabled(struct kvm_lapic *apic) -{ - if (static_key_false(&apic_hw_disabled.key)) - return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; - return MSR_IA32_APICBASE_ENABLE; -} - struct static_key_deferred apic_sw_disabled __read_mostly; static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) { - if ((apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) { + if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) { if (val & APIC_SPIV_APIC_ENABLED) static_key_slow_dec_deferred(&apic_sw_disabled); else @@ -140,23 +127,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) apic_set_reg(apic, APIC_SPIV, val); } -static inline int apic_sw_enabled(struct kvm_lapic *apic) -{ - if (static_key_false(&apic_sw_disabled.key)) - return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; - return APIC_SPIV_APIC_ENABLED; -} - static inline int apic_enabled(struct kvm_lapic *apic) { - return apic_sw_enabled(apic) && apic_hw_enabled(apic); -} - -static inline bool vcpu_has_lapic(struct kvm_vcpu *vcpu) -{ - if (static_key_false(&kvm_no_apic_vcpu)) - return vcpu->arch.apic; - return true; + return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic); } #define LVT_MASK \ @@ -168,34 +141,34 @@ static inline bool vcpu_has_lapic(struct kvm_vcpu *vcpu) static inline int kvm_apic_id(struct kvm_lapic *apic) { - return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; + return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { - return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); + return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); } static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) { - return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; + return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; } static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) { - return ((apic_get_reg(apic, APIC_LVTT) & + return ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); } static inline int apic_lvtt_period(struct kvm_lapic *apic) { - return ((apic_get_reg(apic, APIC_LVTT) & + return ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); } static inline int apic_lvtt_tscdeadline(struct 
kvm_lapic *apic) { - return ((apic_get_reg(apic, APIC_LVTT) & + return ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_TSCDEADLINE); } @@ -211,7 +184,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *feat; u32 v = APIC_VERSION; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return; feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); @@ -319,7 +292,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) * will cause vmexit immediately and the value will be recalculated * on the next vmentry. */ - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return 0; highest_irr = apic_find_highest_irr(vcpu->arch.apic); @@ -404,8 +377,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) u32 tpr, isrv, ppr, old_ppr; int isr; - old_ppr = apic_get_reg(apic, APIC_PROCPRI); - tpr = apic_get_reg(apic, APIC_TASKPRI); + old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI); + tpr = kvm_apic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? isr : 0; @@ -441,13 +414,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) u32 logical_id; if (apic_x2apic_mode(apic)) { - logical_id = apic_get_reg(apic, APIC_LDR); + logical_id = kvm_apic_get_reg(apic, APIC_LDR); return logical_id & mda; } - logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); + logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR)); - switch (apic_get_reg(apic, APIC_DFR)) { + switch (kvm_apic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: if (logical_id & mda) result = 1; @@ -459,7 +432,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) break; default: apic_debug("Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); + apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); break; } @@ -617,7 +590,7 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && + if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { int trigger_mode; if (apic_test_vector(vector, apic->regs + APIC_TMR)) @@ -632,8 +605,8 @@ static int apic_set_eoi(struct kvm_lapic *apic) static void apic_send_ipi(struct kvm_lapic *apic) { - u32 icr_low = apic_get_reg(apic, APIC_ICR); - u32 icr_high = apic_get_reg(apic, APIC_ICR2); + u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); + u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2); struct kvm_lapic_irq irq; irq.vector = icr_low & APIC_VECTOR_MASK; @@ -668,7 +641,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) ASSERT(apic != NULL); /* if initial count is 0, current count should also be 0 */ - if (apic_get_reg(apic, APIC_TMICT) == 0) + if (kvm_apic_get_reg(apic, APIC_TMICT) == 0) return 0; remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); @@ -724,13 +697,13 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) break; case APIC_PROCPRI: apic_update_ppr(apic); - val = apic_get_reg(apic, offset); + val = kvm_apic_get_reg(apic, offset); break; case APIC_TASKPRI: report_tpr_access(apic, false); /* fall thru */ default: - val = apic_get_reg(apic, offset); + val = kvm_apic_get_reg(apic, offset); break; } @@ -782,7 +755,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { - return apic_hw_enabled(apic) && + return kvm_apic_hw_enabled(apic) && 
addr >= apic->base_address && addr < apic->base_address + LAPIC_MMIO_LENGTH; } @@ -805,7 +778,7 @@ static void update_divide_count(struct kvm_lapic *apic) { u32 tmp1, tmp2, tdcr; - tdcr = apic_get_reg(apic, APIC_TDCR); + tdcr = kvm_apic_get_reg(apic, APIC_TDCR); tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; apic->divide_count = 0x1 << (tmp2 & 0x7); @@ -822,7 +795,7 @@ static void start_apic_timer(struct kvm_lapic *apic) if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { /* lapic timer in oneshot or periodic mode */ now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) + apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; if (!apic->lapic_timer.period) @@ -854,7 +827,7 @@ static void start_apic_timer(struct kvm_lapic *apic) "timer initial count 0x%x, period %lldns, " "expire @ 0x%016" PRIx64 ".\n", __func__, APIC_BUS_CYCLE_NS, ktime_to_ns(now), - apic_get_reg(apic, APIC_TMICT), + kvm_apic_get_reg(apic, APIC_TMICT), apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); @@ -886,7 +859,7 @@ static void start_apic_timer(struct kvm_lapic *apic) static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) { - int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); + int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0)); if (apic_lvt_nmi_mode(lvt0_val)) { if (!nmi_wd_enabled) { @@ -937,7 +910,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_SPIV: { u32 mask = 0x3ff; - if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) + if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) mask |= APIC_SPIV_DIRECTED_EOI; apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { @@ -945,7 +918,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) u32 lvt_val; for (i = 0; i < APIC_LVT_NUM; i++) { - lvt_val = apic_get_reg(apic, + lvt_val = kvm_apic_get_reg(apic, APIC_LVTT + 0x10 * i); apic_set_reg(apic, APIC_LVTT + 0x10 * i, lvt_val | APIC_LVT_MASKED); @@ -974,7 +947,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVT1: case APIC_LVTERR: /* TODO: Check vector */ - if (!apic_sw_enabled(apic)) + if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; @@ -983,12 +956,12 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_LVTT: - if ((apic_get_reg(apic, APIC_LVTT) & + if ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) != (val & apic->lapic_timer.timer_mode_mask)) hrtimer_cancel(&apic->lapic_timer.timer); - if (!apic_sw_enabled(apic)) + if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); apic_set_reg(apic, APIC_LVTT, val); @@ -1067,7 +1040,7 @@ static int apic_mmio_write(struct kvm_io_device *this, void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { - if (vcpu_has_lapic(vcpu)) + if (kvm_vcpu_has_lapic(vcpu)) apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); @@ -1084,7 +1057,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) static_key_slow_dec_deferred(&apic_hw_disabled); - if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED)) + if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED)) 
static_key_slow_dec_deferred(&apic_sw_disabled); if (apic->regs) @@ -1103,7 +1076,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return 0; @@ -1114,7 +1087,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return; @@ -1127,21 +1100,21 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return; apic_set_tpr(apic, ((cr8 & 0x0f) << 4) - | (apic_get_reg(apic, APIC_TASKPRI) & 4)); + | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4)); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { u64 tpr; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return 0; - tpr = (u64) apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); + tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; } @@ -1238,16 +1211,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) vcpu->arch.apic_base, apic->base_address); } -bool kvm_apic_present(struct kvm_vcpu *vcpu) -{ - return vcpu_has_lapic(vcpu) && apic_hw_enabled(vcpu->arch.apic); -} - -int kvm_lapic_enabled(struct kvm_vcpu *vcpu) -{ - return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); -} - /* *---------------------------------------------------------------------- * timer interface @@ -1263,7 +1226,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (vcpu_has_lapic(vcpu) && apic_enabled(apic) && + if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT)) return atomic_read(&apic->lapic_timer.pending); @@ -1272,10 +1235,10 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { - u32 reg = apic_get_reg(apic, lvt_type); + u32 reg = kvm_apic_get_reg(apic, lvt_type); int vector, mode, trig_mode; - if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { + if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; @@ -1375,23 +1338,23 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; - if (!vcpu_has_lapic(vcpu) || !apic_enabled(apic)) + if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic)) return -1; apic_update_ppr(apic); highest_irr = apic_find_highest_irr(apic); if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) + ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI))) return -1; return highest_irr; } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { - u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); + u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (!apic_hw_enabled(vcpu->arch.apic)) + if (!kvm_apic_hw_enabled(vcpu->arch.apic)) r = 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) @@ -1403,7 +1366,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return; if 
(atomic_read(&apic->lapic_timer.pending) > 0) { @@ -1432,7 +1395,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); kvm_apic_set_version(vcpu); - apic_set_spiv(apic, apic_get_reg(apic, APIC_SPIV)); + apic_set_spiv(apic, kvm_apic_get_reg(apic, APIC_SPIV)); apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); @@ -1448,7 +1411,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { struct hrtimer *timer; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return; timer = &vcpu->arch.apic->lapic_timer.timer; @@ -1549,7 +1512,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; + tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) max_irr = 0; @@ -1608,7 +1571,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return 1; /* if this is ICR write vector before command */ @@ -1622,7 +1585,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 low, high = 0; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return 1; if (apic_reg_read(apic, reg, 4, &low)) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 73fa299b68e..2ad9caa06f9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -55,8 +55,6 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu); -bool kvm_apic_present(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); @@ -79,4 +77,47 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); void kvm_lapic_init(void); + +static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off) +{ + return *((u32 *) (apic->regs + reg_off)); +} + +extern struct static_key kvm_no_apic_vcpu; + +static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu) +{ + if (static_key_false(&kvm_no_apic_vcpu)) + return vcpu->arch.apic; + return true; +} + +extern struct static_key_deferred apic_hw_disabled; + +static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic) +{ + if (static_key_false(&apic_hw_disabled.key)) + return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; + return MSR_IA32_APICBASE_ENABLE; +} + +extern struct static_key_deferred apic_sw_disabled; + +static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic) +{ + if (static_key_false(&apic_sw_disabled.key)) + return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; + return APIC_SPIV_APIC_ENABLED; +} + +static inline bool kvm_apic_present(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic); +} + +static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +{ + return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); +} + #endif -- cgit v1.2.3 From 64eb0620296f924d5fded755c5ed95fb73649e06 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 8 Aug 2012 15:24:36 +0300 Subject: KVM: correctly detect 
APIC SW state in kvm_apic_post_state_restore() For apic_set_spiv() to track APIC SW state correctly it needs to see previous and next values of the spurious vector register, but currently memset() overwrite the old value before apic_set_spiv() get a chance to do tracking. Fix it by calling apic_set_spiv() before overwriting old value. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 7 +++++-- arch/x86/kvm/lapic.h | 3 ++- arch/x86/kvm/x86.c | 3 +-- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 333c27fa6e9..18d149d8020 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1389,13 +1389,16 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) return vector; } -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) { struct kvm_lapic *apic = vcpu->arch.apic; kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); + /* set SPIV separately to get count of SW disabled APICs right */ + apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); + memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_set_version(vcpu); - apic_set_spiv(apic, kvm_apic_get_reg(apic, APIC_SPIV)); apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 2ad9caa06f9..615a8b03016 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -54,7 +54,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8ebf65c349e..91a595827de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2348,8 +2348,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - kvm_apic_post_state_restore(vcpu); + kvm_apic_post_state_restore(vcpu, s); update_cr8_intercept(vcpu); return 0; -- cgit v1.2.3 From 51d59c6b422f3f95940ae4e5b42f165595906aee Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Aug 2012 15:57:49 -0300 Subject: KVM: x86: fix pvclock guest stopped flag reporting kvm_guest_time_update unconditionally clears hv_clock.flags field, so the notification never reaches the guest. Fix it by allowing PVCLOCK_GUEST_STOPPED to passthrough. 
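The shape of the fix is a common KVM pattern worth spelling out: rather than writing the shared pvclock structure directly (where the next clock update wipes the flag), the ioctl records a request on the vcpu and the single writer of hv_clock consumes it. A condensed editorial sketch of the two halves, using the field and request names from the patch but with made-up example_* function names:

static int example_set_guest_paused(struct kvm_vcpu *vcpu)
{
	if (!vcpu->arch.time_page)
		return -EINVAL;
	vcpu->arch.pvclock_set_guest_stopped_request = true;
	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
	return 0;
}

static void example_publish_pvclock_flags(struct kvm_vcpu *vcpu)
{
	u8 pvclock_flags = 0;

	/* fold the pending request into the flags about to be published */
	if (vcpu->arch.pvclock_set_guest_stopped_request) {
		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
		vcpu->arch.pvclock_set_guest_stopped_request = false;
	}
	vcpu->arch.hv_clock.flags = pvclock_flags;
}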
Reviewed-by: Eric B Munson Reviewed-by: Amit Shah Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1309e69b57f..fc0e752e756 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -420,6 +420,8 @@ struct kvm_vcpu_arch { unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; + /* set guest stopped flag in pvclock flags field */ + bool pvclock_set_guest_stopped_request; struct { u64 msr_val; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 91a595827de..fb0d93788bf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1134,6 +1134,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) unsigned long this_tsc_khz; s64 kernel_ns, max_kernel_ns; u64 tsc_timestamp; + u8 pvclock_flags; /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -1215,7 +1216,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; - vcpu->hv_clock.flags = 0; + + pvclock_flags = 0; + if (vcpu->pvclock_set_guest_stopped_request) { + pvclock_flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + + vcpu->hv_clock.flags = pvclock_flags; /* * The interface expects us to write an even number signaling that the @@ -2624,10 +2632,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, */ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) { - struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock; if (!vcpu->arch.time_page) return -EINVAL; - src->flags |= PVCLOCK_GUEST_STOPPED; + vcpu->arch.pvclock_set_guest_stopped_request = true; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return 0; } -- cgit v1.2.3 From e423ca155d3f5f16b46e30de9c818875b1fd617d Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Tue, 7 Aug 2012 13:10:13 +0530 Subject: KVM: Correct vmrun to vmcall typo Signed-off-by: Raghavendra K T Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_para.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 2f7712e08b1..20f5697888b 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -116,7 +116,7 @@ static inline bool kvm_check_and_clear_guest_paused(void) */ #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" -/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun +/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the * instructions are guaranteed to be supported. * -- cgit v1.2.3 From 2a7921b7a033d5cfc176690d6297c82846c582b2 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 12 Aug 2012 16:12:29 +0300 Subject: KVM: VMX: restore MSR_IA32_DEBUGCTLMSR after VMEXIT MSR_IA32_DEBUGCTLMSR is zeroed on VMEXIT. Restore it to the correct value. 
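The fix follows the usual save-around-the-fast-path shape: snapshot the MSR before entering the guest and rewrite it afterwards only when it was non-zero, so the common case (DEBUGCTL == 0) stays free. A trimmed editorial sketch of the bookkeeping added to vmx_vcpu_run(), assuming the get_debugctlmsr()/update_debugctlmsr() helpers the patch relies on:

static void example_vcpu_run(struct kvm_vcpu *vcpu)
{
	unsigned long debugctlmsr = get_debugctlmsr();

	/* VMLAUNCH/VMRESUME happens here; hardware zeroes DEBUGCTL on VM exit */

	if (debugctlmsr)
		update_debugctlmsr(debugctlmsr);
}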
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cc8ad983692..d0f4bec9fc6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6222,6 +6222,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long debugctlmsr; if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -6261,6 +6262,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); atomic_switch_perf_msrs(vmx); + debugctlmsr = get_debugctlmsr(); vmx->__launched = vmx->loaded_vmcs->launched; asm( @@ -6362,6 +6364,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ + if (debugctlmsr) + update_debugctlmsr(debugctlmsr); + #ifndef CONFIG_X86_64 /* * The sysexit path does not restore ds/es, so we must set them to -- cgit v1.2.3 From dbcb4e798072d114fe68813f39a9efd239ab99c0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 13 Aug 2012 15:38:22 +0300 Subject: KVM: VMX: Advertize RDTSC exiting to nested guests All processors that support VMX have that feature, and guests (Xen) depend on it. As we already implement it, advertize it to the guest. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d0f4bec9fc6..13e0296cea4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1990,7 +1990,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) #endif CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | - CPU_BASED_RDPMC_EXITING | + CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the -- cgit v1.2.3 From 28a6fdabb3ea775d3d707afd9d2728b3ced2c34d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 14 Aug 2012 19:20:28 +0300 Subject: KVM: x86: drop parameter validation in ioapic/pic We validate irq pin number when routing is setup, so code handling illegal irq # in pic and ioapic on each injection is never called. Drop it, replace with BUG_ON to catch out of bounds access bugs. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8259.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index e498b18f010..90c84f947d4 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -190,17 +190,17 @@ void kvm_pic_update_irq(struct kvm_pic *s) int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) { - int ret = -1; + int ret, irq_level; + + BUG_ON(irq < 0 || irq >= PIC_NUM_PINS); pic_lock(s); - if (irq >= 0 && irq < PIC_NUM_PINS) { - int irq_level = __kvm_irq_line_state(&s->irq_states[irq], - irq_source_id, level); - ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); - pic_update_irq(s); - trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, - s->pics[irq >> 3].imr, ret == 0); - } + irq_level = __kvm_irq_line_state(&s->irq_states[irq], + irq_source_id, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); + pic_update_irq(s); + trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, + s->pics[irq >> 3].imr, ret == 0); pic_unlock(s); return ret; -- cgit v1.2.3 From 8fbe6a541f50eeec5e3e49bd92db23ade9496673 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 15 Aug 2012 16:00:40 +0200 Subject: KVM guest: disable stealtime on reboot to avoid mem corruption else, host continues to update stealtime after reboot, which can corrupt e.g. initramfs area. found when tracking down initramfs unpack error on initial reboot (with qemu-kvm -smp 2, no problem with single-core). Signed-off-by: Florian Westphal Signed-off-by: Marcelo Tosatti --- arch/x86/kernel/kvm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c1d61ee4b4f..1596cc8fd79 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -354,6 +354,7 @@ static void kvm_pv_guest_cpu_reboot(void *unused) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) wrmsrl(MSR_KVM_PV_EOI_EN, 0); kvm_pv_disable_apf(); + kvm_disable_steal_time(); } static int kvm_pv_reboot_notify(struct notifier_block *nb, -- cgit v1.2.3 From 8e3d9d061b5d132217629e7b5635ff0c02488e65 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 10:57:42 +0800 Subject: KVM: x86: fix possible infinite loop caused by reexecute_instruction Currently, we reexecute all unhandleable instructions if they do not access on the mmio, however, it can not work if host map the readonly memory to guest. If the instruction try to write this kind of memory, it will fault again when guest retry it, then we will goto a infinite loop: retry instruction -> write #PF -> emulation fail -> retry instruction -> ... 
Fix it by retrying the instruction only when it faults on the writable memory Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fb0d93788bf..704680d0fa3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4473,6 +4473,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa; + pfn_t pfn; if (tdp_enabled) return false; @@ -4490,8 +4491,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) if (gpa == UNMAPPED_GVA) return true; /* let cpu generate fault */ - if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) + /* + * Do not retry the unhandleable instruction if it faults on the + * readonly host memory, otherwise it will goto a infinite loop: + * retry instruction -> write #PF -> emulation fail -> retry + * instruction -> ... + */ + pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); + if (!is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); return true; + } return false; } -- cgit v1.2.3 From 037d92dc5d4691ae7cf44699c55ca83b1b441992 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 10:59:12 +0800 Subject: KVM: introduce gfn_to_pfn_memslot_atomic It can instead of hva_to_pfn_atomic Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9651c2cd000..5548971ae80 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2510,15 +2510,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; - unsigned long hva; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) return KVM_PFN_ERR_FAULT; - hva = gfn_to_hva_memslot(slot, gfn); - - return hva_to_pfn_atomic(hva); + return gfn_to_pfn_memslot_atomic(slot, gfn); } static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 4d8b81abc47b83a1939e59df2fdb0e98dfe0eedd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 11:02:51 +0800 Subject: KVM: introduce readonly memslot In current code, if we map a readonly memory space from host to guest and the page is not currently mapped in the host, we will get a fault pfn and async is not allowed, then the vm will crash We introduce readonly memory region to map ROM/ROMD to the guest, read access is happy for readonly memslot, write access on readonly memslot will cause KVM_EXIT_MMIO exit Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + arch/x86/kvm/mmu.c | 9 +++++++++ arch/x86/kvm/x86.c | 1 + 3 files changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 246617efd67..521bf252e34 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -25,6 +25,7 @@ #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE #define __KVM_HAVE_XCRS +#define __KVM_HAVE_READONLY_MEM /* Architectural interrupt line count. 
*/ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5548971ae80..8e312a2e141 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2647,6 +2647,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) { + /* + * Do not cache the mmio info caused by writing the readonly gfn + * into the spte otherwise read access on readonly gfn also can + * caused mmio page fault and treat it as mmio access. + * Return 1 to tell kvm to emulate it. + */ + if (pfn == KVM_PFN_ERR_RO_FAULT) + return 1; + if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 704680d0fa3..42bbf4187d2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2175,6 +2175,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: + case KVM_CAP_READONLY_MEM: r = 1; break; case KVM_CAP_COALESCED_MMIO: -- cgit v1.2.3 From 90993cdd1800dc6ef9587431a0c625b978584e81 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 16 Aug 2012 17:00:19 -0300 Subject: x86: KVM guest: merge CONFIG_KVM_CLOCK into CONFIG_KVM_GUEST The distinction between CONFIG_KVM_CLOCK and CONFIG_KVM_GUEST is not so clear anymore, as demonstrated by recent bugs caused by poor handling of on/off combinations of these options. Merge CONFIG_KVM_CLOCK into CONFIG_KVM_GUEST. Reported-By: OGAWA Hirofumi Signed-off-by: Marcelo Tosatti --- arch/x86/Kconfig | 21 ++++++++------------- arch/x86/include/asm/kvm_para.h | 4 ++-- arch/x86/kernel/Makefile | 3 +-- arch/x86/kernel/kvm.c | 2 -- arch/x86/kernel/setup.c | 2 +- 5 files changed, 12 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ec3a1aa4ab..a42e2e99caa 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -573,23 +573,18 @@ config PARAVIRT_TIME_ACCOUNTING source "arch/x86/xen/Kconfig" -config KVM_CLOCK - bool "KVM paravirtualized clock" - select PARAVIRT - select PARAVIRT_CLOCK - ---help--- - Turning on this option will allow you to run a paravirtualized clock - when running over the KVM hypervisor. Instead of relying on a PIT - (or probably other) emulation by the underlying device model, the host - provides the guest with timing infrastructure such as time of day, and - system time - config KVM_GUEST - bool "KVM Guest support" + bool "KVM Guest support (including kvmclock)" + select PARAVIRT select PARAVIRT + select PARAVIRT_CLOCK + default y if PARAVIRT_GUEST ---help--- This option enables various optimizations for running under the KVM - hypervisor. + hypervisor. 
It includes a paravirtualized clock, so that instead + of relying on a PIT (or probably other) emulation by the + underlying device model, the host provides the guest with + timing infrastructure such as time of day, and system time source "arch/x86/lguest/Kconfig" diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 20f5697888b..eb3e9d85e1f 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -102,14 +102,14 @@ struct kvm_vcpu_pv_apf_data { extern void kvmclock_init(void); extern int kvm_register_clock(char *txt); -#ifdef CONFIG_KVM_CLOCK +#ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); #else static inline bool kvm_check_and_clear_guest_paused(void) { return false; } -#endif /* CONFIG_KVMCLOCK */ +#endif /* CONFIG_KVM_GUEST */ /* This instruction is vmcall. On non-VT architectures, it will generate a * trap that we will then rewrite to the appropriate instruction. diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8215e5652d9..7203298e0b8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -81,8 +81,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o -obj-$(CONFIG_KVM_GUEST) += kvm.o -obj-$(CONFIG_KVM_CLOCK) += kvmclock.o +obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1596cc8fd79..b3e5e51bc90 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -397,9 +397,7 @@ void kvm_disable_steal_time(void) #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { -#ifdef CONFIG_KVM_CLOCK WARN_ON(kvm_register_clock("primary cpu clock")); -#endif kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f4b9b80e1b9..b3386ae3438 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -957,7 +957,7 @@ void __init setup_arch(char **cmdline_p) initmem_init(); memblock_find_dma_reserve(); -#ifdef CONFIG_KVM_CLOCK +#ifdef CONFIG_KVM_GUEST kvmclock_init(); #endif -- cgit v1.2.3 From 66a03505a7fcc70187319ef2318832f4d3c451a6 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 24 Aug 2012 16:50:28 +0800 Subject: KVM: PPC: book3s: fix build error caused by gfn_to_hva_memslot() The build error was caused by that builtin functions are calling the functions implemented in modules. This error was introduced by commit 4d8b81abc4 ("KVM: introduce readonly memslot"). The patch fixes the build error by moving function __gfn_to_hva_memslot() from kvm_main.c to kvm_host.h and making that "inline" so that the builtin function (kvmppc_h_enter) can use that. 
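For context (editorial note): the helper being moved is a single line of address arithmetic, which is why turning it into a static inline in kvm_host.h is cheap enough to be the fix. Roughly, as it appears in headers of that period:

static inline unsigned long __gfn_to_hva_memslot(struct kvm_memory_slot *slot,
						 gfn_t gfn)
{
	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}

With that in the header, built-in code such as kvmppc_h_enter() can call it without pulling in symbols from the kvm module.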
Acked-by: Paul Mackerras Signed-off-by: Gavin Shan Signed-off-by: Marcelo Tosatti --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 56ac1a5d991..fb0e821622d 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -197,7 +197,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, pa &= PAGE_MASK; } else { /* Translate to host virtual address */ - hva = gfn_to_hva_memslot(memslot, gfn); + hva = __gfn_to_hva_memslot(memslot, gfn); /* Look up the Linux PTE for the backing page */ pte_size = psize; -- cgit v1.2.3 From dd856efafe6097a5c9104725c2bca74430423db8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 27 Aug 2012 23:46:17 +0300 Subject: KVM: x86 emulator: access GPRs on demand Instead of populating the entire register file, read in registers as they are accessed, and write back only the modified ones. This saves a VMREAD and VMWRITE on Intel (for rsp, since it is not usually used during emulation), and a two 128-byte copies for the registers. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 20 ++- arch/x86/kvm/emulate.c | 299 +++++++++++++++++++++++-------------- arch/x86/kvm/x86.c | 45 +++--- 3 files changed, 220 insertions(+), 144 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c764f43b71c..282aee5d6ac 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -85,6 +85,19 @@ struct x86_instruction_info { #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ struct x86_emulate_ops { + /* + * read_gpr: read a general purpose register (rax - r15) + * + * @reg: gpr number. + */ + ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg); + /* + * write_gpr: write a general purpose register (rax - r15) + * + * @reg: gpr number. + * @val: value to write. + */ + void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val); /* * read_std: Read bytes of standard (non-emulated/special) memory. * Used for descriptor reading. @@ -281,8 +294,10 @@ struct x86_emulate_ctxt { bool rip_relative; unsigned long _eip; struct operand memop; + u32 regs_valid; /* bitmaps of registers in _regs[] that can be read */ + u32 regs_dirty; /* bitmaps of registers in _regs[] that have been written */ /* Fields above regs are cleared together. 
*/ - unsigned long regs[NR_VCPU_REGS]; + unsigned long _regs[NR_VCPU_REGS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; @@ -394,4 +409,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); +void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); +void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); + #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e8fb6c5c6c0..5e27ba53261 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -202,6 +202,42 @@ struct gprefix { #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a #define EFLG_RESERVED_ONE_MASK 2 +static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + if (!(ctxt->regs_valid & (1 << nr))) { + ctxt->regs_valid |= 1 << nr; + ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); + } + return ctxt->_regs[nr]; +} + +static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + ctxt->regs_valid |= 1 << nr; + ctxt->regs_dirty |= 1 << nr; + return &ctxt->_regs[nr]; +} + +static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + reg_read(ctxt, nr); + return reg_write(ctxt, nr); +} + +static void writeback_registers(struct x86_emulate_ctxt *ctxt) +{ + unsigned reg; + + for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16) + ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); +} + +static void invalidate_registers(struct x86_emulate_ctxt *ctxt) +{ + ctxt->regs_dirty = 0; + ctxt->regs_valid = 0; +} + /* * Instruction emulation: * Most instructions are emulated directly via a fragment of inline assembly @@ -374,8 +410,8 @@ struct gprefix { #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ do { \ unsigned long _tmp; \ - ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \ - ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \ + ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \ + ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \ \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "5", "1") \ @@ -494,7 +530,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, in static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) { - masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc); + masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc); } static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -786,14 +822,15 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, * pointer into the block that addresses the relevant register. * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 
*/ -static void *decode_register(u8 modrm_reg, unsigned long *regs, +static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, int highbyte_regs) { void *p; - p = ®s[modrm_reg]; if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) - p = (unsigned char *)®s[modrm_reg & 3] + 1; + p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; + else + p = reg_rmw(ctxt, modrm_reg); return p; } @@ -982,10 +1019,10 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->type = OP_REG; if (ctxt->d & ByteOp) { - op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); + op->addr.reg = decode_register(ctxt, reg, highbyte_regs); op->bytes = 1; } else { - op->addr.reg = decode_register(reg, ctxt->regs, 0); + op->addr.reg = decode_register(ctxt, reg, 0); op->bytes = ctxt->op_bytes; } fetch_register_operand(op); @@ -1020,8 +1057,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if (ctxt->modrm_mod == 3) { op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - op->addr.reg = decode_register(ctxt->modrm_rm, - ctxt->regs, ctxt->d & ByteOp); + op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, ctxt->d & ByteOp); if (ctxt->d & Sse) { op->type = OP_XMM; op->bytes = 16; @@ -1042,10 +1078,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; if (ctxt->ad_bytes == 2) { - unsigned bx = ctxt->regs[VCPU_REGS_RBX]; - unsigned bp = ctxt->regs[VCPU_REGS_RBP]; - unsigned si = ctxt->regs[VCPU_REGS_RSI]; - unsigned di = ctxt->regs[VCPU_REGS_RDI]; + unsigned bx = reg_read(ctxt, VCPU_REGS_RBX); + unsigned bp = reg_read(ctxt, VCPU_REGS_RBP); + unsigned si = reg_read(ctxt, VCPU_REGS_RSI); + unsigned di = reg_read(ctxt, VCPU_REGS_RDI); /* 16-bit ModR/M decode. */ switch (ctxt->modrm_mod) { @@ -1102,17 +1138,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) modrm_ea += insn_fetch(s32, ctxt); else { - modrm_ea += ctxt->regs[base_reg]; + modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); } if (index_reg != 4) - modrm_ea += ctxt->regs[index_reg] << scale; + modrm_ea += reg_read(ctxt, index_reg) << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; } else { base_reg = ctxt->modrm_rm; - modrm_ea += ctxt->regs[base_reg]; + modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); } switch (ctxt->modrm_mod) { @@ -1250,10 +1286,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (rc->pos == rc->end) { /* refill pio read ahead */ unsigned int in_page, n; unsigned int count = ctxt->rep_prefix ? - address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1; + address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; in_page = (ctxt->eflags & EFLG_DF) ? 
- offset_in_page(ctxt->regs[VCPU_REGS_RDI]) : - PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]); + offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : + PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, count); if (n == 0) @@ -1533,7 +1569,7 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) struct segmented_address addr; rsp_increment(ctxt, -bytes); - addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); + addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; return segmented_write(ctxt, addr, data, bytes); @@ -1552,7 +1588,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, int rc; struct segmented_address addr; - addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); + addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) @@ -1620,26 +1656,28 @@ static int em_enter(struct x86_emulate_ctxt *ctxt) int rc; unsigned frame_size = ctxt->src.val; unsigned nesting_level = ctxt->src2.val & 31; + ulong rbp; if (nesting_level) return X86EMUL_UNHANDLEABLE; - rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); + rbp = reg_read(ctxt, VCPU_REGS_RBP); + rc = push(ctxt, &rbp, stack_size(ctxt)); if (rc != X86EMUL_CONTINUE) return rc; - assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], + assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP), stack_mask(ctxt)); - assign_masked(&ctxt->regs[VCPU_REGS_RSP], - ctxt->regs[VCPU_REGS_RSP] - frame_size, + assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), + reg_read(ctxt, VCPU_REGS_RSP) - frame_size, stack_mask(ctxt)); return X86EMUL_CONTINUE; } static int em_leave(struct x86_emulate_ctxt *ctxt) { - assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], + assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP), stack_mask(ctxt)); - return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); + return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes); } static int em_push_sreg(struct x86_emulate_ctxt *ctxt) @@ -1667,13 +1705,13 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) static int em_pusha(struct x86_emulate_ctxt *ctxt) { - unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP]; + unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP); int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; while (reg <= VCPU_REGS_RDI) { (reg == VCPU_REGS_RSP) ? 
- (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]); + (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg)); rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) @@ -1702,7 +1740,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) --reg; } - rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes); + rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) break; --reg; @@ -1710,7 +1748,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) return rc; } -int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) +static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) { struct x86_emulate_ops *ops = ctxt->ops; int rc; @@ -1759,11 +1797,22 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) return rc; } +int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) +{ + int rc; + + invalidate_registers(ctxt); + rc = __emulate_int_real(ctxt, irq); + if (rc == X86EMUL_CONTINUE) + writeback_registers(ctxt); + return rc; +} + static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) { switch(ctxt->mode) { case X86EMUL_MODE_REAL: - return emulate_int_real(ctxt, irq); + return __emulate_int_real(ctxt, irq); case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: case X86EMUL_MODE_PROT32: @@ -1970,14 +2019,14 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) { u64 old = ctxt->dst.orig_val64; - if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) || - ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) { - ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0); - ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32); + if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) || + ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { + *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); + *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32); ctxt->eflags &= ~EFLG_ZF; } else { - ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) | - (u32) ctxt->regs[VCPU_REGS_RBX]; + ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) | + (u32) reg_read(ctxt, VCPU_REGS_RBX); ctxt->eflags |= EFLG_ZF; } @@ -2013,7 +2062,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) { /* Save real source value, then compare EAX against destination. */ ctxt->src.orig_val = ctxt->src.val; - ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; + ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); emulate_2op_SrcV(ctxt, "cmp"); if (ctxt->eflags & EFLG_ZF) { @@ -2022,7 +2071,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) } else { /* Failure: write the value we saw to EAX. */ ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; + ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); } return X86EMUL_CONTINUE; } @@ -2159,10 +2208,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip; + *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip; if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 - ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; + *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF; ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? 
@@ -2241,7 +2290,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) ctxt->_eip = msr_data; ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); - ctxt->regs[VCPU_REGS_RSP] = msr_data; + *reg_write(ctxt, VCPU_REGS_RSP) = msr_data; return X86EMUL_CONTINUE; } @@ -2291,8 +2340,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ctxt->_eip = ctxt->regs[VCPU_REGS_RDX]; - ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX]; + ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX); + *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX); return X86EMUL_CONTINUE; } @@ -2361,14 +2410,14 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, { tss->ip = ctxt->_eip; tss->flag = ctxt->eflags; - tss->ax = ctxt->regs[VCPU_REGS_RAX]; - tss->cx = ctxt->regs[VCPU_REGS_RCX]; - tss->dx = ctxt->regs[VCPU_REGS_RDX]; - tss->bx = ctxt->regs[VCPU_REGS_RBX]; - tss->sp = ctxt->regs[VCPU_REGS_RSP]; - tss->bp = ctxt->regs[VCPU_REGS_RBP]; - tss->si = ctxt->regs[VCPU_REGS_RSI]; - tss->di = ctxt->regs[VCPU_REGS_RDI]; + tss->ax = reg_read(ctxt, VCPU_REGS_RAX); + tss->cx = reg_read(ctxt, VCPU_REGS_RCX); + tss->dx = reg_read(ctxt, VCPU_REGS_RDX); + tss->bx = reg_read(ctxt, VCPU_REGS_RBX); + tss->sp = reg_read(ctxt, VCPU_REGS_RSP); + tss->bp = reg_read(ctxt, VCPU_REGS_RBP); + tss->si = reg_read(ctxt, VCPU_REGS_RSI); + tss->di = reg_read(ctxt, VCPU_REGS_RDI); tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); @@ -2384,14 +2433,14 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, ctxt->_eip = tss->ip; ctxt->eflags = tss->flag | 2; - ctxt->regs[VCPU_REGS_RAX] = tss->ax; - ctxt->regs[VCPU_REGS_RCX] = tss->cx; - ctxt->regs[VCPU_REGS_RDX] = tss->dx; - ctxt->regs[VCPU_REGS_RBX] = tss->bx; - ctxt->regs[VCPU_REGS_RSP] = tss->sp; - ctxt->regs[VCPU_REGS_RBP] = tss->bp; - ctxt->regs[VCPU_REGS_RSI] = tss->si; - ctxt->regs[VCPU_REGS_RDI] = tss->di; + *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax; + *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx; + *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx; + *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx; + *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp; + *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp; + *reg_write(ctxt, VCPU_REGS_RSI) = tss->si; + *reg_write(ctxt, VCPU_REGS_RDI) = tss->di; /* * SDM says that segment selectors are loaded before segment @@ -2476,14 +2525,14 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, tss->cr3 = ctxt->ops->get_cr(ctxt, 3); tss->eip = ctxt->_eip; tss->eflags = ctxt->eflags; - tss->eax = ctxt->regs[VCPU_REGS_RAX]; - tss->ecx = ctxt->regs[VCPU_REGS_RCX]; - tss->edx = ctxt->regs[VCPU_REGS_RDX]; - tss->ebx = ctxt->regs[VCPU_REGS_RBX]; - tss->esp = ctxt->regs[VCPU_REGS_RSP]; - tss->ebp = ctxt->regs[VCPU_REGS_RBP]; - tss->esi = ctxt->regs[VCPU_REGS_RSI]; - tss->edi = ctxt->regs[VCPU_REGS_RDI]; + tss->eax = reg_read(ctxt, VCPU_REGS_RAX); + tss->ecx = reg_read(ctxt, VCPU_REGS_RCX); + tss->edx = reg_read(ctxt, VCPU_REGS_RDX); + tss->ebx = reg_read(ctxt, VCPU_REGS_RBX); + tss->esp = reg_read(ctxt, VCPU_REGS_RSP); + tss->ebp = reg_read(ctxt, VCPU_REGS_RBP); + tss->esi = reg_read(ctxt, VCPU_REGS_RSI); + tss->edi = reg_read(ctxt, VCPU_REGS_RDI); tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); @@ -2505,14 +2554,14 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, ctxt->eflags = tss->eflags | 2; /* General purpose 
registers */ - ctxt->regs[VCPU_REGS_RAX] = tss->eax; - ctxt->regs[VCPU_REGS_RCX] = tss->ecx; - ctxt->regs[VCPU_REGS_RDX] = tss->edx; - ctxt->regs[VCPU_REGS_RBX] = tss->ebx; - ctxt->regs[VCPU_REGS_RSP] = tss->esp; - ctxt->regs[VCPU_REGS_RBP] = tss->ebp; - ctxt->regs[VCPU_REGS_RSI] = tss->esi; - ctxt->regs[VCPU_REGS_RDI] = tss->edi; + *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax; + *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx; + *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx; + *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx; + *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp; + *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp; + *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi; + *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi; /* * SDM says that segment selectors are loaded before segment @@ -2727,14 +2776,17 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, { int rc; + invalidate_registers(ctxt); ctxt->_eip = ctxt->eip; ctxt->dst.type = OP_NONE; rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); - if (rc == X86EMUL_CONTINUE) + if (rc == X86EMUL_CONTINUE) { ctxt->eip = ctxt->_eip; + writeback_registers(ctxt); + } return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } @@ -2744,8 +2796,8 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, { int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; - register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes); - op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]); + register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes); + op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg)); op->addr.mem.seg = seg; } @@ -2921,7 +2973,7 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt) { ctxt->dst.type = OP_REG; ctxt->dst.bytes = ctxt->src.bytes; - ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); return X86EMUL_CONTINUE; @@ -2932,8 +2984,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) u64 tsc = 0; ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); - ctxt->regs[VCPU_REGS_RAX] = (u32)tsc; - ctxt->regs[VCPU_REGS_RDX] = tsc >> 32; + *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc; + *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32; return X86EMUL_CONTINUE; } @@ -2941,10 +2993,10 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt) { u64 pmc; - if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc)) + if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc)) return emulate_gp(ctxt, 0); - ctxt->regs[VCPU_REGS_RAX] = (u32)pmc; - ctxt->regs[VCPU_REGS_RDX] = pmc >> 32; + *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc; + *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32; return X86EMUL_CONTINUE; } @@ -2986,9 +3038,9 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt) { u64 msr_data; - msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] - | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); - if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) + msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) + | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); + if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -2998,11 +3050,11 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt) { u64 msr_data; - if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) + if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data)) return emulate_gp(ctxt, 0); - ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; - 
ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; + *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data; + *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32; return X86EMUL_CONTINUE; } @@ -3182,8 +3234,8 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt) static int em_loop(struct x86_emulate_ctxt *ctxt) { - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); - if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) && + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); + if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) jmp_rel(ctxt, ctxt->src.val); @@ -3192,7 +3244,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt) static int em_jcxz(struct x86_emulate_ctxt *ctxt) { - if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) jmp_rel(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; @@ -3280,20 +3332,20 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt) { u32 eax, ebx, ecx, edx; - eax = ctxt->regs[VCPU_REGS_RAX]; - ecx = ctxt->regs[VCPU_REGS_RCX]; + eax = reg_read(ctxt, VCPU_REGS_RAX); + ecx = reg_read(ctxt, VCPU_REGS_RCX); ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); - ctxt->regs[VCPU_REGS_RAX] = eax; - ctxt->regs[VCPU_REGS_RBX] = ebx; - ctxt->regs[VCPU_REGS_RCX] = ecx; - ctxt->regs[VCPU_REGS_RDX] = edx; + *reg_write(ctxt, VCPU_REGS_RAX) = eax; + *reg_write(ctxt, VCPU_REGS_RBX) = ebx; + *reg_write(ctxt, VCPU_REGS_RCX) = ecx; + *reg_write(ctxt, VCPU_REGS_RDX) = edx; return X86EMUL_CONTINUE; } static int em_lahf(struct x86_emulate_ctxt *ctxt) { - ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; - ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; + *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL; + *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8; return X86EMUL_CONTINUE; } @@ -3450,7 +3502,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt) static int check_svme_pa(struct x86_emulate_ctxt *ctxt) { - u64 rax = ctxt->regs[VCPU_REGS_RAX]; + u64 rax = reg_read(ctxt, VCPU_REGS_RAX); /* Valid physical address? */ if (rax & 0xffff000000000000ULL) @@ -3472,7 +3524,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) static int check_rdpmc(struct x86_emulate_ctxt *ctxt) { u64 cr4 = ctxt->ops->get_cr(ctxt, 4); - u64 rcx = ctxt->regs[VCPU_REGS_RCX]; + u64 rcx = reg_read(ctxt, VCPU_REGS_RCX); if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || (rcx > 3)) @@ -3930,7 +3982,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpAcc: op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - op->addr.reg = &ctxt->regs[VCPU_REGS_RAX]; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); fetch_register_operand(op); op->orig_val = op->val; break; @@ -3938,19 +3990,19 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 
1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); + register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI)); op->addr.mem.seg = VCPU_SREG_ES; op->val = 0; break; case OpDX: op->type = OP_REG; op->bytes = 2; - op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); fetch_register_operand(op); break; case OpCL: op->bytes = 1; - op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff; + op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff; break; case OpImmByte: rc = decode_imm(ctxt, op, 1, true); @@ -3981,7 +4033,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); + register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); op->addr.mem.seg = seg_override(ctxt); op->val = 0; break; @@ -4287,6 +4339,7 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); } + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; @@ -4371,7 +4424,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if (ctxt->rep_prefix && (ctxt->d & String)) { /* All REP prefixes have the same first termination condition */ - if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) { + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { ctxt->eip = ctxt->_eip; goto done; } @@ -4444,7 +4497,7 @@ special_insn: ctxt->dst.val = ctxt->src.addr.mem.ea; break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ - if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) + if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX)) break; rc = em_xchg(ctxt); break; @@ -4472,7 +4525,7 @@ special_insn: rc = em_grp2(ctxt); break; case 0xd2 ... 0xd3: /* Grp2 */ - ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; + ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX); rc = em_grp2(ctxt); break; case 0xe9: /* jmp rel */ @@ -4527,14 +4580,14 @@ writeback: if (ctxt->rep_prefix && (ctxt->d & String)) { struct read_cache *r = &ctxt->io_read; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); if (!string_insn_completed(ctxt)) { /* * Re-enter guest when pio read ahead buffer is empty * or, if it is not used, after each 1024 iteration. */ - if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) && + if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) && (r->end == 0 || r->end != r->pos)) { /* * Reset read cache. Usually happens before @@ -4542,6 +4595,7 @@ writeback: * we have to do it here. */ ctxt->mem_read.end = 0; + writeback_registers(ctxt); return EMULATION_RESTART; } goto done; /* skip rip writeback */ @@ -4556,6 +4610,9 @@ done: if (rc == X86EMUL_INTERCEPTED) return EMULATION_INTERCEPTED; + if (rc == X86EMUL_CONTINUE) + writeback_registers(ctxt); + return (rc == X86EMUL_UNHANDLEABLE) ? 
EMULATION_FAILED : EMULATION_OK; twobyte_insn: @@ -4628,3 +4685,13 @@ twobyte_insn: cannot_emulate: return EMULATION_FAILED; } + +void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt) +{ + invalidate_registers(ctxt); +} + +void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) +{ + writeback_registers(ctxt); +} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42bbf4187d2..e00050ce7a6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4313,7 +4313,19 @@ static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); } +static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) +{ + return kvm_register_read(emul_to_vcpu(ctxt), reg); +} + +static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val) +{ + kvm_register_write(emul_to_vcpu(ctxt), reg, val); +} + static struct x86_emulate_ops emulate_ops = { + .read_gpr = emulator_read_gpr, + .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, .fetch = kvm_fetch_guest_virt, @@ -4348,14 +4360,6 @@ static struct x86_emulate_ops emulate_ops = { .get_cpuid = emulator_get_cpuid, }; -static void cache_all_regs(struct kvm_vcpu *vcpu) -{ - kvm_register_read(vcpu, VCPU_REGS_RAX); - kvm_register_read(vcpu, VCPU_REGS_RSP); - kvm_register_read(vcpu, VCPU_REGS_RIP); - vcpu->arch.regs_dirty = ~0; -} - static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); @@ -4382,12 +4386,10 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } -static void init_decode_cache(struct x86_emulate_ctxt *ctxt, - const unsigned long *regs) +static void init_decode_cache(struct x86_emulate_ctxt *ctxt) { memset(&ctxt->twobyte, 0, - (void *)&ctxt->regs - (void *)&ctxt->twobyte); - memcpy(ctxt->regs, regs, sizeof(ctxt->regs)); + (void *)&ctxt->_regs - (void *)&ctxt->twobyte); ctxt->fetch.start = 0; ctxt->fetch.end = 0; @@ -4402,14 +4404,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int cs_db, cs_l; - /* - * TODO: fix emulate.c to use guest_read/write_register - * instead of direct ->regs accesses, can save hundred cycles - * on Intel for instructions that don't read/change RSP, for - * for example. 
- */ - cache_all_regs(vcpu); - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); ctxt->eflags = kvm_get_rflags(vcpu); @@ -4421,7 +4415,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) X86EMUL_MODE_PROT16; ctxt->guest_mode = is_guest_mode(vcpu); - init_decode_cache(ctxt, vcpu->arch.regs); + init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } @@ -4441,7 +4435,6 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) return EMULATE_FAIL; ctxt->eip = ctxt->_eip; - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); @@ -4599,7 +4592,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, changes registers values during IO operation */ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { vcpu->arch.emulate_regs_need_sync_from_vcpu = false; - memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); + emulator_invalidate_register_cache(ctxt); } restart: @@ -4637,7 +4630,6 @@ restart: toggle_interruptibility(vcpu, ctxt->interruptibility); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); } else @@ -5591,8 +5583,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); + emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); @@ -5723,6 +5714,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; + unsigned reg; init_emulate_ctxt(vcpu); @@ -5732,7 +5724,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, if (ret) return EMULATE_FAIL; - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); -- cgit v1.2.3 From baa7e81e325bbb6ddebd7680ac1068859244b61d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:06:58 +0300 Subject: KVM: VMX: Separate saving pre-realmode state from setting segments Commit b246dd5df139 ("KVM: VMX: Fix KVM_SET_SREGS with big real mode segments") moved fix_rmode_seg() to vmx_set_segment(), so that it is applied not just on transitions to real mode, but also on KVM_SET_SREGS (migration). However fix_rmode_seg() not only munges the vmcs segments, it also sets up the save area for us to restore when returning to protected mode or to return in vmx_get_segment(). Move saving the segment into a new function, save_rmode_seg(), and call it just during the transition. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 13e0296cea4..4e49caf9224 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2768,7 +2768,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) return kvm->arch.tss_addr; } -static void fix_rmode_seg(int seg, struct kvm_save_segment *save) +static void save_rmode_seg(int seg, struct kvm_save_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -2776,6 +2776,12 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) save->base = vmcs_readl(sf->base); save->limit = vmcs_read32(sf->limit); save->ar = vmcs_read32(sf->ar_bytes); +} + +static void fix_rmode_seg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + vmcs_write16(sf->selector, save->base >> 4); vmcs_write32(sf->base, save->base & 0xffff0); vmcs_write32(sf->limit, 0xffff); @@ -2798,6 +2804,12 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; + save_rmode_seg(VCPU_SREG_TR, &vmx->rmode.tr); + save_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + save_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + save_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + save_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + /* * Very old userspace does not call KVM_SET_TSS_ADDR before entering * vcpu. Call it here with phys address pointing 16M below 4G. @@ -2812,14 +2824,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx_segment_cache_clear(vmx); - vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); - vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); - - vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - - vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); -- cgit v1.2.3 From 72fbefec26841699fee9ad0b050624aeb43d5bae Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:06:59 +0300 Subject: KVM: VMX: Fix incorrect lookup of segment S flag in fix_pmode_dataseg() fix_pmode_dataseg() looks up S in ->base instead of ->ar_bytes. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4e49caf9224..1d93079432b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2693,11 +2693,11 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } -static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) +static void fix_pmode_dataseg(int seg, struct kvm_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { + if (vmcs_readl(sf->base) == save->base && (save->ar_bytes & AR_S_MASK)) { vmcs_write16(sf->selector, save->selector); vmcs_writel(sf->base, save->base); vmcs_write32(sf->limit, save->limit); -- cgit v1.2.3 From f5f7b2fe3bf849b58c8144729aba78b8e29e1e4c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:00 +0300 Subject: KVM: VMX: Use kvm_segment to save protected-mode segments when entering realmode Instead of using struct kvm_save_segment, use struct kvm_segment, which is what the other APIs use. 
This leads to some simplification. We replace save_rmode_seg() with a call to vmx_save_segment(). Since this depends on rmode.vm86_active, we move the call to before setting the flag. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 85 +++++++++++++++--------------------------------------- 1 file changed, 24 insertions(+), 61 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1d93079432b..7e95ff68b9d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -405,16 +405,16 @@ struct vcpu_vmx { struct { int vm86_active; ulong save_rflags; + struct kvm_segment segs[8]; + } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ struct kvm_save_segment { u16 selector; unsigned long base; u32 limit; u32 ar; - } tr, es, ds, fs, gs; - } rmode; - struct { - u32 bitmask; /* 4 bits per segment (1 bit per field) */ - struct kvm_save_segment seg[8]; + } seg[8]; } segment_cache; int vpid; bool emulation_required; @@ -2693,15 +2693,12 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } -static void fix_pmode_dataseg(int seg, struct kvm_segment *save) +static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - if (vmcs_readl(sf->base) == save->base && (save->ar_bytes & AR_S_MASK)) { - vmcs_write16(sf->selector, save->selector); - vmcs_writel(sf->base, save->base); - vmcs_write32(sf->limit, save->limit); - vmcs_write32(sf->ar_bytes, save->ar); + if (vmcs_readl(sf->base) == save->base && save->s) { + vmx_set_segment(vcpu, save, seg); } else { u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) << AR_DPL_SHIFT; @@ -2719,10 +2716,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx_segment_cache_clear(vmx); - vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); - vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); + vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; @@ -2737,10 +2731,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) if (emulate_invalid_guest_state) return; - fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); + fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); + fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); vmx_segment_cache_clear(vmx); @@ -2768,17 +2762,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) return kvm->arch.tss_addr; } -static void save_rmode_seg(int seg, struct kvm_save_segment *save) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - save->selector = vmcs_read16(sf->selector); - save->base = vmcs_readl(sf->base); - save->limit = vmcs_read32(sf->limit); - save->ar = vmcs_read32(sf->ar_bytes); -} - -static void fix_rmode_seg(int seg, struct kvm_save_segment *save) +static void fix_rmode_seg(int seg, struct kvm_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -2801,14 +2785,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu) if (enable_unrestricted_guest) return; + 
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; - save_rmode_seg(VCPU_SREG_TR, &vmx->rmode.tr); - save_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); - save_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); - save_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); - save_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); /* * Very old userspace does not call KVM_SET_TSS_ADDR before entering @@ -3118,7 +3103,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_save_segment *save; u32 ar; if (vmx->rmode.vm86_active @@ -3126,27 +3110,15 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS || seg == VCPU_SREG_GS) && !emulate_invalid_guest_state) { - switch (seg) { - case VCPU_SREG_TR: save = &vmx->rmode.tr; break; - case VCPU_SREG_ES: save = &vmx->rmode.es; break; - case VCPU_SREG_DS: save = &vmx->rmode.ds; break; - case VCPU_SREG_FS: save = &vmx->rmode.fs; break; - case VCPU_SREG_GS: save = &vmx->rmode.gs; break; - default: BUG(); - } - var->selector = save->selector; - var->base = save->base; - var->limit = save->limit; - ar = save->ar; + *var = vmx->rmode.segs[seg]; if (seg == VCPU_SREG_TR || var->selector == vmx_read_guest_seg_selector(vmx, seg)) - goto use_saved_rmode_seg; + return; } var->base = vmx_read_guest_seg_base(vmx, seg); var->limit = vmx_read_guest_seg_limit(vmx, seg); var->selector = vmx_read_guest_seg_selector(vmx, seg); ar = vmx_read_guest_seg_ar(vmx, seg); -use_saved_rmode_seg: if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; var->type = ar & 15; @@ -3235,10 +3207,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { vmcs_write16(sf->selector, var->selector); - vmx->rmode.tr.selector = var->selector; - vmx->rmode.tr.base = var->base; - vmx->rmode.tr.limit = var->limit; - vmx->rmode.tr.ar = vmx_segment_access_rights(var); + vmx->rmode.segs[VCPU_SREG_TR] = *var; return; } vmcs_writel(sf->base, var->base); @@ -3289,16 +3258,10 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_readl(GUEST_CS_BASE) >> 4); break; case VCPU_SREG_ES: - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); - break; case VCPU_SREG_DS: - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); - break; case VCPU_SREG_GS: - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); - break; case VCPU_SREG_FS: - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + fix_rmode_seg(seg, &vmx->rmode.segs[seg]); break; case VCPU_SREG_SS: vmcs_write16(GUEST_SS_SELECTOR, -- cgit v1.2.3 From c865c43de66dc973865bda337022f03b6e16c8df Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:01 +0300 Subject: KVM: VMX: Retain limit and attributes when entering protected mode Real processors don't change segment limits and attributes while in real mode. Mimic that behaviour. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7e95ff68b9d..88eeb405560 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2696,14 +2696,14 @@ static __exit void hardware_unsetup(void) static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct kvm_segment tmp = *save; - if (vmcs_readl(sf->base) == save->base && save->s) { - vmx_set_segment(vcpu, save, seg); - } else { - u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) - << AR_DPL_SHIFT; - vmcs_write32(sf->ar_bytes, 0x93 | dpl); + if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { + tmp.base = vmcs_readl(sf->base); + tmp.selector = vmcs_read16(sf->selector); + tmp.s = 1; } + vmx_set_segment(vcpu, &tmp, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 495e116684cebc5ae625916aba37fc07f345707b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:02 +0300 Subject: KVM: VMX: Allow real mode emulation using vm86 with dpl=0 Real mode is always entered from protected mode with dpl=0. Since the dpl doesn't affect execution, and we already override it to 3 in the vmcs (as vmx requires), we can allow execution in that state. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 88eeb405560..4811d91759a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3317,7 +3317,7 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) return false; if (var.limit != 0xffff) return false; - if (ar != 0xf3) + if ((ar | (3 << AR_DPL_SHIFT)) != 0xf3) return false; return true; -- cgit v1.2.3 From e2a610d7fc3e285af8061ff071761752255d95f6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:03 +0300 Subject: KVM: VMX: Allow vm86 virtualization of big real mode Usually, big real mode uses large (4GB) segments. Currently we don't virtualize this; if any segment has a limit other than 0xffff, we emulate. But if we set the vmx-visible limit to 0xffff, we can use vm86 to virtualize real mode; if an access overruns the segment limit, the guest will #GP, which we will trap and forward to the emulator. This results in significantly faster execution, and less risk of hitting an unemulated instruction. If the limit is less than 0xffff, we retain the existing behaviour. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4811d91759a..fd21eb45466 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3315,7 +3315,7 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) if (var.base != (var.selector << 4)) return false; - if (var.limit != 0xffff) + if (var.limit < 0xffff) return false; if ((ar | (3 << AR_DPL_SHIFT)) != 0xf3) return false; -- cgit v1.2.3 From 03ebebeb1ff5d1d6209fd8df4ffc9204df82bd55 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:04 +0300 Subject: KVM: x86 emulator: Leave segment limit and attributs alone in real mode When loading a segment in real mode, only the base and selector must be modified. 
The limit needs to be left alone, otherwise big real mode users will hit a #GP due to limit checking (currently this is suppressed because we don't check limits in real mode). Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5e27ba53261..f8b27cd2a6c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1388,19 +1388,15 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ ulong desc_addr; int ret; + u16 dummy; memset(&seg_desc, 0, sizeof seg_desc); if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) || ctxt->mode == X86EMUL_MODE_REAL) { /* set real mode segment descriptor */ + ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg); set_desc_base(&seg_desc, selector << 4); - set_desc_limit(&seg_desc, 0xffff); - seg_desc.type = 3; - seg_desc.p = 1; - seg_desc.s = 1; - if (ctxt->mode == X86EMUL_MODE_VM86) - seg_desc.dpl = 3; goto load; } -- cgit v1.2.3 From a5625189f6810ef79ced53989c794acfa10d3370 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:05 +0300 Subject: KVM: x86 emulator: Check segment limits in real mode too Segment limits are verified in real mode, not just protected mode. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f8b27cd2a6c..5b1c701cd6d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -668,8 +668,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, la = seg_base(ctxt, addr.seg) + addr.ea; switch (ctxt->mode) { - case X86EMUL_MODE_REAL: - break; case X86EMUL_MODE_PROT64: if (((signed long)la << 16) >> 16 != la) return emulate_gp(ctxt, 0); @@ -699,7 +697,10 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, goto bad; } cpl = ctxt->ops->cpl(ctxt); - rpl = sel & 3; + if (ctxt->mode == X86EMUL_MODE_REAL) + rpl = 0; + else + rpl = sel & 3; cpl = max(cpl, rpl); if (!(desc.type & 8)) { /* data segment */ -- cgit v1.2.3 From 0afbe2f8781a812c7e501ec129eff45b21f792af Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:06 +0300 Subject: KVM: x86 emulator: Fix #GP error code during linearization We want the segment selector, nor segment number. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5b1c701cd6d..1451cffd97e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -725,9 +725,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; bad: if (addr.seg == VCPU_SREG_SS) - return emulate_ss(ctxt, addr.seg); + return emulate_ss(ctxt, sel); else - return emulate_gp(ctxt, addr.seg); + return emulate_gp(ctxt, sel); } static int linearize(struct x86_emulate_ctxt *ctxt, -- cgit v1.2.3 From 726364202853f843a97a8ba4a7c3cd91e3aa84b7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:07 +0300 Subject: KVM: VMX: Return real real-mode segment data even if emulate_invalid_guest_state=1 emulate_invalid_guest_state=1 doesn't mean we don't munge the segments in the vmcs; we do. 
So we need to return the real ones (maintained by vmx_set_segment). Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fd21eb45466..0d6872621ab 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3108,8 +3108,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, if (vmx->rmode.vm86_active && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS - || seg == VCPU_SREG_GS) - && !emulate_invalid_guest_state) { + || seg == VCPU_SREG_GS)) { *var = vmx->rmode.segs[seg]; if (seg == VCPU_SREG_TR || var->selector == vmx_read_guest_seg_selector(vmx, seg)) -- cgit v1.2.3 From 1390a28b274e2e45f89bac67c435cbcbc5cc0790 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:08 +0300 Subject: KVM: VMX: Preserve segment limit and access rights in real mode While this is undocumented, real processors do not reload the segment limit and access rights when loading a segment register in real mode. Real programs rely on it so we need to comply with this behaviour. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0d6872621ab..6e6421aeca0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3113,6 +3113,9 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, if (seg == VCPU_SREG_TR || var->selector == vmx_read_guest_seg_selector(vmx, seg)) return; + var->base = vmx_read_guest_seg_base(vmx, seg); + var->selector = vmx_read_guest_seg_selector(vmx, seg); + return; } var->base = vmx_read_guest_seg_base(vmx, seg); var->limit = vmx_read_guest_seg_limit(vmx, seg); -- cgit v1.2.3 From ce5668034752e52e995c8dc625337380099a088a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:09 +0300 Subject: KVM: VMX: Save all segment data in real mode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6e6421aeca0..62b2d0cf2a7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3216,6 +3216,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); if (vmx->rmode.vm86_active && var->s) { + vmx->rmode.segs[seg] = *var; /* * Hack real-mode segments into vm86 compatibility. */ -- cgit v1.2.3 From a81aba14dc0ea499f4c218b5db0303b2ea8151d3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:10 +0300 Subject: KVM: VMX: Ignore segment G and D bits when considering whether we can virtualize We will enter the guest with G and D cleared; as real hardware ignores D in real mode, and G is taken care of by the limit test, we allow more code to run in vm86 mode. 
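For clarity, 0xf3 is the access-rights value of an ordinary vm86 data segment, assuming the usual low-byte layout of the VMX access-rights field (type in bits 0-3, S in bit 4, DPL in bits 5-6, P in bit 7):

	/* 0xf3 = P (0x80) | DPL=3 (0x60) | S (0x10) | type 3: read/write, accessed (0x03) */

Masking off AR_G_MASK and AR_DB_MASK before the comparison therefore only relaxes the granularity and default-size checks; the remaining attribute bits must still match.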
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 62b2d0cf2a7..248c2b490e9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3320,7 +3320,7 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) return false; if (var.limit < 0xffff) return false; - if ((ar | (3 << AR_DPL_SHIFT)) != 0xf3) + if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3) return false; return true; -- cgit v1.2.3 From 9a7819774e4236e8736a074b7e85276967911924 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 30 Aug 2012 17:45:54 -0300 Subject: KVM: x86: remove unused variable from kvm_task_switch() Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e00050ce7a6..20f2266dfb6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5714,7 +5714,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; - unsigned reg; init_emulate_ctxt(vcpu); -- cgit v1.2.3 From ec798660cf72c981ad8eed272487a0fe2b3222f2 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 3 Sep 2012 14:47:25 +0300 Subject: KVM: cleanup pic reset kvm_pic_reset() is not used anywhere. Move reset logic from pic_ioport_write() there. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 52 +++++++++++----------------------------------------- 1 file changed, 11 insertions(+), 41 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 90c84f947d4..848206df096 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -275,23 +275,20 @@ void kvm_pic_reset(struct kvm_kpic_state *s) { int irq, i; struct kvm_vcpu *vcpu; - u8 irr = s->irr, isr = s->imr; + u8 edge_irr = s->irr & ~s->elcr; bool found = false; s->last_irr = 0; - s->irr = 0; + s->irr &= s->elcr; s->imr = 0; - s->isr = 0; s->priority_add = 0; - s->irq_base = 0; - s->read_reg_select = 0; - s->poll = 0; s->special_mask = 0; - s->init_state = 0; - s->auto_eoi = 0; - s->rotate_on_auto_eoi = 0; - s->special_fully_nested_mode = 0; - s->init4 = 0; + s->read_reg_select = 0; + if (!s->init4) { + s->special_fully_nested_mode = 0; + s->auto_eoi = 0; + } + s->init_state = 1; kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) if (kvm_apic_accept_pic_intr(vcpu)) { @@ -304,7 +301,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) return; for (irq = 0; irq < PIC_NUM_PINS/2; irq++) - if (irr & (1 << irq) || isr & (1 << irq)) + if (edge_irr & (1 << irq)) pic_clear_isr(s, irq); } @@ -316,40 +313,13 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) addr &= 1; if (addr == 0) { if (val & 0x10) { - u8 edge_irr = s->irr & ~s->elcr; - int i; - bool found; - struct kvm_vcpu *vcpu; - s->init4 = val & 1; - s->last_irr = 0; - s->irr &= s->elcr; - s->imr = 0; - s->priority_add = 0; - s->special_mask = 0; - s->read_reg_select = 0; - if (!s->init4) { - s->special_fully_nested_mode = 0; - s->auto_eoi = 0; - } - s->init_state = 1; if (val & 0x02) pr_pic_unimpl("single mode not supported"); if (val & 0x08) pr_pic_unimpl( - "level sensitive irq not supported"); - - kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) - if (kvm_apic_accept_pic_intr(vcpu)) { - found = true; - break; - } - - - if (found) - for (irq = 0; irq < PIC_NUM_PINS/2; irq++) - if 
(edge_irr & (1 << irq)) - pic_clear_isr(s, irq); + "level sensitive irq not supported"); + kvm_pic_reset(s); } else if (val & 0x08) { if (val & 0x04) s->poll = 1; -- cgit v1.2.3 From 326d07cb30fed2387efccd4bf3bd8e4f28719e9e Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:13 +0200 Subject: KVM: x86: minor size optimization Some fields can be constified and/or made static to reduce code and data size. Numbers for a 32 bit build: text data bss dec hex filename before: 3351 80 0 3431 d67 cpuid.o after: 3391 0 0 3391 d3f cpuid.o Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/cpuid.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b496da684bd..ec79e773342 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -397,8 +397,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; } case KVM_CPUID_SIGNATURE: { - char signature[12] = "KVMKVMKVM\0\0"; - u32 *sigptr = (u32 *)signature; + static const char signature[12] = "KVMKVMKVM\0\0"; + const u32 *sigptr = (const u32 *)signature; entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; @@ -484,10 +484,10 @@ struct kvm_cpuid_param { u32 func; u32 idx; bool has_leaf_count; - bool (*qualifier)(struct kvm_cpuid_param *param); + bool (*qualifier)(const struct kvm_cpuid_param *param); }; -static bool is_centaur_cpu(struct kvm_cpuid_param *param) +static bool is_centaur_cpu(const struct kvm_cpuid_param *param) { return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; } @@ -498,7 +498,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = -E2BIG, i; u32 func; - static struct kvm_cpuid_param param[] = { + static const struct kvm_cpuid_param param[] = { { .func = 0, .has_leaf_count = true }, { .func = 0x80000000, .has_leaf_count = true }, { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, @@ -517,7 +517,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, r = 0; for (i = 0; i < ARRAY_SIZE(param); i++) { - struct kvm_cpuid_param *ent = ¶m[i]; + const struct kvm_cpuid_param *ent = ¶m[i]; if (ent->qualifier && !ent->qualifier(ent)) continue; -- cgit v1.2.3 From 89a87c67791b840d815a2028b88cefe6906ed42c Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:14 +0200 Subject: KVM: x86 emulator: use aligned variants of SSE register ops As the the compiler ensures that the memory operand is always aligned to a 16 byte memory location, use the aligned variant of MOVDQ for read_sse_reg() and write_sse_reg(). 
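The memory operand here is an sse128_t, whose declaration in kvm_emulate.h is roughly of this form (a sketch; the exact typedef is not part of the diff below):

	typedef u32 __attribute__((vector_size(16))) sse128_t;

The vector_size attribute makes the compiler align such objects to 16 bytes, so movdqa, which faults on unaligned operands, is safe here and avoids the unaligned-move overhead of movdqu on older processors.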
Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 64 +++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1451cffd97e..5a0fee1a19c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -909,23 +909,23 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; - case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; - case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; - case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; - case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; - case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; - case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; - case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; + case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; + case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; + case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break; + case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break; + case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break; + case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break; + case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break; + case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break; #ifdef CONFIG_X86_64 - case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; - case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; - case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; - case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; - case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; - case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; - case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; - case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; + case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break; + case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break; + case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break; + case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break; + case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break; + case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break; + case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break; + case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break; #endif default: BUG(); } @@ -937,23 +937,23 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; - case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; - case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; - case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; - case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; - case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; - case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; - case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; + case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; + case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; + case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break; + case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break; + case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break; + case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break; + case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break; + case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break; #ifdef CONFIG_X86_64 - case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; - 
case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; - case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; - case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; - case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; - case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; - case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; - case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; + case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break; + case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break; + case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break; + case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break; + case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break; + case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break; + case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break; + case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break; #endif default: BUG(); } -- cgit v1.2.3 From fd0a0d82083747301f6c8084b4141bb490625016 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:15 +0200 Subject: KVM: x86 emulator: mark opcode tables const The opcode tables never change at runtime, therefor mark them const. Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5a0fee1a19c..fd06f9d6584 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -161,9 +161,9 @@ struct opcode { u64 intercept : 8; union { int (*execute)(struct x86_emulate_ctxt *ctxt); - struct opcode *group; - struct group_dual *gdual; - struct gprefix *gprefix; + const struct opcode *group; + const struct group_dual *gdual; + const struct gprefix *gprefix; } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); }; @@ -3574,13 +3574,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) -static struct opcode group7_rm1[] = { +static const struct opcode group7_rm1[] = { DI(SrcNone | Priv, monitor), DI(SrcNone | Priv, mwait), N, N, N, N, N, N, }; -static struct opcode group7_rm3[] = { +static const struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), @@ -3591,13 +3591,13 @@ static struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, invlpga, check_svme), }; -static struct opcode group7_rm7[] = { +static const struct opcode group7_rm7[] = { N, DIP(SrcNone, rdtscp, check_rdtsc), N, N, N, N, N, N, }; -static struct opcode group1[] = { +static const struct opcode group1[] = { I(Lock, em_add), I(Lock | PageTable, em_or), I(Lock, em_adc), @@ -3608,11 +3608,11 @@ static struct opcode group1[] = { I(0, em_cmp), }; -static struct opcode group1A[] = { +static const struct opcode group1A[] = { I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, }; -static struct opcode group3[] = { +static const struct opcode group3[] = { I(DstMem | SrcImm, em_test), I(DstMem | SrcImm, em_test), I(DstMem | SrcNone | Lock, em_not), @@ -3623,13 +3623,13 @@ static struct opcode group3[] = { I(SrcMem, em_idiv_ex), }; -static struct opcode group4[] = { +static const struct opcode group4[] = { I(ByteOp | DstMem | SrcNone | Lock, em_grp45), I(ByteOp | DstMem | SrcNone | Lock, em_grp45), N, N, N, N, N, N, }; -static struct opcode 
group5[] = { +static const struct opcode group5[] = { I(DstMem | SrcNone | Lock, em_grp45), I(DstMem | SrcNone | Lock, em_grp45), I(SrcMem | Stack, em_grp45), @@ -3639,7 +3639,7 @@ static struct opcode group5[] = { I(SrcMem | Stack, em_grp45), N, }; -static struct opcode group6[] = { +static const struct opcode group6[] = { DI(Prot, sldt), DI(Prot, str), II(Prot | Priv | SrcMem16, em_lldt, lldt), @@ -3647,7 +3647,7 @@ static struct opcode group6[] = { N, N, N, N, }; -static struct group_dual group7 = { { +static const struct group_dual group7 = { { II(Mov | DstMem | Priv, em_sgdt, sgdt), II(Mov | DstMem | Priv, em_sidt, sidt), II(SrcMem | Priv, em_lgdt, lgdt), @@ -3664,7 +3664,7 @@ static struct group_dual group7 = { { EXT(0, group7_rm7), } }; -static struct opcode group8[] = { +static const struct opcode group8[] = { N, N, N, N, I(DstMem | SrcImmByte, em_bt), I(DstMem | SrcImmByte | Lock | PageTable, em_bts), @@ -3672,26 +3672,26 @@ static struct opcode group8[] = { I(DstMem | SrcImmByte | Lock | PageTable, em_btc), }; -static struct group_dual group9 = { { +static const struct group_dual group9 = { { N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { N, N, N, N, N, N, N, N, } }; -static struct opcode group11[] = { +static const struct opcode group11[] = { I(DstMem | SrcImm | Mov | PageTable, em_mov), X7(D(Undefined)), }; -static struct gprefix pfx_0f_6f_0f_7f = { +static const struct gprefix pfx_0f_6f_0f_7f = { I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; -static struct gprefix pfx_vmovntpx = { +static const struct gprefix pfx_vmovntpx = { I(0, em_mov), N, N, N, }; -static struct opcode opcode_table[256] = { +static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ I6ALU(Lock, em_add), I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), @@ -3808,7 +3808,7 @@ static struct opcode opcode_table[256] = { D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), }; -static struct opcode twobyte_table[256] = { +static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, N, I(ImplicitOps | VendorSpecific, em_syscall), -- cgit v1.2.3 From 0225fb509d51fcf777eb0aa31c304c582e3248fd Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:16 +0200 Subject: KVM: x86 emulator: constify emulate_ops We never change emulate_ops[] at runtime so it should be r/o. Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/kvm/emulate.c | 22 +++++++++++----------- arch/x86/kvm/x86.c | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 282aee5d6ac..b5bb73aecc0 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -250,7 +250,7 @@ struct read_cache { }; struct x86_emulate_ctxt { - struct x86_emulate_ops *ops; + const struct x86_emulate_ops *ops; /* Register state before/after emulation. 
*/ unsigned long eflags; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fd06f9d6584..663e95881bd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1325,7 +1325,7 @@ static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; if (selector & 1 << 2) { struct desc_struct desc; @@ -1747,7 +1747,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; int rc; struct desc_ptr dt; gva_t cs_addr; @@ -2129,7 +2129,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt) static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; u32 eax, ebx, ecx, edx; /* @@ -2173,7 +2173,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) static int em_syscall(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; @@ -2231,7 +2231,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) static int em_sysenter(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; @@ -2294,7 +2294,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) static int em_sysexit(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; int usermode; @@ -2357,7 +2357,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, u16 port, u16 len) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct tr_seg; u32 base3; int r; @@ -2476,7 +2476,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 tss_selector, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct tss_segment_16 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); @@ -2623,7 +2623,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 tss_selector, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct tss_segment_32 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); @@ -2667,7 +2667,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct curr_tss_desc, next_tss_desc; int ret; u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); @@ -4339,7 +4339,7 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; int rc = 
X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 20f2266dfb6..0dc066f0428 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4323,7 +4323,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon kvm_register_write(emul_to_vcpu(ctxt), reg, val); } -static struct x86_emulate_ops emulate_ops = { +static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, -- cgit v1.2.3 From 0fbe9b0b19fb92eee2cf23c23d63d6b3312681e5 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:17 +0200 Subject: KVM: x86: constify read_write_emulator_ops We never change those, make them r/o. Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0dc066f0428..317241619e2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3773,14 +3773,14 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, return X86EMUL_CONTINUE; } -static struct read_write_emulator_ops read_emultor = { +static const struct read_write_emulator_ops read_emultor = { .read_write_prepare = read_prepare, .read_write_emulate = read_emulate, .read_write_mmio = vcpu_mmio_read, .read_write_exit_mmio = read_exit_mmio, }; -static struct read_write_emulator_ops write_emultor = { +static const struct read_write_emulator_ops write_emultor = { .read_write_emulate = write_emulate, .read_write_mmio = write_mmio, .read_write_exit_mmio = write_exit_mmio, @@ -3791,7 +3791,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, struct kvm_vcpu *vcpu, - struct read_write_emulator_ops *ops) + const struct read_write_emulator_ops *ops) { gpa_t gpa; int handled, ret; @@ -3840,7 +3840,7 @@ mmio: int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, - struct read_write_emulator_ops *ops) + const struct read_write_emulator_ops *ops) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; -- cgit v1.2.3 From f1d248315afc55771c3991b934014daa154d05f1 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:18 +0200 Subject: KVM: x86: more constification Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 18d149d8020..07ad628dadb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -198,7 +198,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) return apic->vcpu->arch.apic_base & X2APIC_ENABLE; } -static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { +static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 317241619e2..666da13c34f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -817,7 +817,7 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; -static u32 emulated_msrs[] = { +static const u32 emulated_msrs[] = { MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, 
MSR_IA32_MCG_STATUS, -- cgit v1.2.3 From 772e031899fdf3c7636d66aae9b0b57d1aaebb93 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:19 +0200 Subject: KVM: VMX: constify lookup tables We use vmcs_field_to_offset_table[], kvm_vmx_segment_fields[] and kvm_vmx_exit_handlers[] as lookup tables only -- make them r/o. Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 248c2b490e9..d62b4139a29 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -450,7 +450,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ [number##_HIGH] = VMCS12_OFFSET(name)+4 -static unsigned short vmcs_field_to_offset_table[] = { +static const unsigned short vmcs_field_to_offset_table[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), FIELD(GUEST_ES_SELECTOR, guest_es_selector), FIELD(GUEST_CS_SELECTOR, guest_cs_selector), @@ -666,7 +666,7 @@ static struct vmx_capability { .ar_bytes = GUEST_##seg##_AR_BYTES, \ } -static struct kvm_vmx_segment_field { +static const struct kvm_vmx_segment_field { unsigned selector; unsigned base; unsigned limit; @@ -2695,7 +2695,7 @@ static __exit void hardware_unsetup(void) static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_segment tmp = *save; if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { @@ -2764,7 +2764,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) static void fix_rmode_seg(int seg, struct kvm_segment *save) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; vmcs_write16(sf->selector, save->base >> 4); vmcs_write32(sf->base, save->base & 0xffff0); @@ -3202,7 +3202,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; vmx_segment_cache_clear(vmx); @@ -3572,7 +3572,7 @@ out: static void seg_setup(int seg) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; unsigned int ar; vmcs_write16(sf->selector, 0); @@ -5655,7 +5655,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. */ -static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { +static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, -- cgit v1.2.3 From 09941fbb712655cde9b350852be7a99a6f61a03f Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:20 +0200 Subject: KVM: SVM: constify lookup tables We never modify direct_access_msrs[], msrpm_ranges[], svm_exit_handlers[] or x86_intercept_map[] at runtime. Mark them r/o. 
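
The tables below include declarations such as svm_exit_handlers[], whose declarator is easy to misread, so here is a small standalone illustration (hypothetical names, plain C, not taken from the patch): an array of const pointers to functions, which is what allows the entire table to be placed in a read-only section.

#include <stdio.h>

static int add_one(int x) { return x + 1; }
static int double_it(int x) { return 2 * x; }

/* "array of const pointers to function taking int, returning int":
 * no slot can be reassigned at run time, so the whole table can live
 * in .rodata rather than writable data. */
static int (*const demo_handlers[])(int) = {
	add_one,
	double_it,
};

int main(void)
{
	printf("%d %d\n", demo_handlers[0](1), demo_handlers[1](21));
	return 0;
}
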
Signed-off-by: Mathias Krause Cc: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 31be4a55744..611c72875fb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -163,7 +163,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio); #define MSR_INVALID 0xffffffffU -static struct svm_direct_access_msrs { +static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ } direct_access_msrs[] = { @@ -400,7 +400,7 @@ struct svm_init_data { int r; }; -static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; +static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) #define MSRS_RANGE_SIZE 2048 @@ -3267,7 +3267,7 @@ static int pause_interception(struct vcpu_svm *svm) return 1; } -static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { +static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, [SVM_EXIT_READ_CR4] = cr_interception, @@ -4068,7 +4068,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) #define POST_MEM(exit) { .exit_code = (exit), \ .stage = X86_ICPT_POST_MEMACCESS, } -static struct __x86_intercept { +static const struct __x86_intercept { u32 exit_code; enum x86_intercept_stage stage; } x86_intercept_map[] = { -- cgit v1.2.3 From 2df72e9bc4c505d8357012f2924589f3d16f9d44 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 24 Aug 2012 15:54:57 -0300 Subject: KVM: split kvm_arch_flush_shadow Introducing kvm_arch_flush_shadow_memslot, to invalidate the translations of a single memory slot. 
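
A standalone sketch of the resulting interface shape (hypothetical types, not the kernel's): the whole-VM flush stays available as the conservative fallback, and the per-slot hook can later be specialised by architectures able to zap only the affected slot, which is how the x86 hunk below forwards the memslot variant for now.

#include <stdio.h>

struct demo_vm { int translations; };
struct demo_slot { int id; };

/* Heavyweight path: drop every cached translation for the VM. */
static void demo_flush_shadow_all(struct demo_vm *vm)
{
	vm->translations = 0;
}

/* Slot-scoped hook: correct even when it simply falls back to the full
 * flush, just coarser than a real per-slot zap would be. */
static void demo_flush_shadow_memslot(struct demo_vm *vm, struct demo_slot *slot)
{
	(void)slot;
	demo_flush_shadow_all(vm);
}

int main(void)
{
	struct demo_vm vm = { .translations = 42 };
	struct demo_slot slot = { .id = 3 };

	demo_flush_shadow_memslot(&vm, &slot);
	printf("translations left: %d\n", vm.translations);
	return 0;
}
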
Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 8 +++++++- arch/powerpc/kvm/powerpc.c | 6 +++++- arch/s390/kvm/kvm-s390.c | 7 ++++++- arch/x86/kvm/x86.c | 8 +++++++- 4 files changed, 25 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index eac65380bd2..8b3a9c0e771 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1613,11 +1613,17 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, return; } -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_flush_remote_tlbs(kvm); } +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_arch_flush_shadow_all(); +} + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 879b14a6140..4d213b8b0fb 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -334,8 +334,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvmppc_core_commit_memory_region(kvm, mem); } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ +} -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) { } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e83df7f0fed..ecced9d1898 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -969,7 +969,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, return; } -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) { } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 666da13c34f..37797a090a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6447,12 +6447,18 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); } -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_mmu_zap_all(kvm); kvm_reload_remote_mmus(kvm); } +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_arch_flush_shadow_all(kvm); +} + int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && -- cgit v1.2.3 From 3b4dc3a031110753b9ba36432dbd21f989fcee56 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 28 Aug 2012 17:43:26 -0300 Subject: KVM: move postcommit flush to x86, as mmio sptes are x86 specific Other arches do not need this. Signed-off-by: Marcelo Tosatti v2: fix incorrect deletion of mmio sptes on gpa move (noticed by Takuya) Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 37797a090a8..6f6812ec841 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6445,6 +6445,14 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); + /* + * If memory slot is created, or moved, we need to clear all + * mmio sptes. 
+ */ + if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) { + kvm_mmu_zap_all(kvm); + kvm_reload_remote_mmus(kvm); + } } void kvm_arch_flush_shadow_all(struct kvm *kvm) -- cgit v1.2.3 From 716d51abff06f48425cef15d78ca6f36093f6dbf Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 3 Sep 2012 15:24:26 +0300 Subject: KVM: Provide userspace IO exit completion callback Current code assumes that IO exit was due to instruction emulation and handles execution back to emulator directly. This patch adds new userspace IO exit completion callback that can be set by any other code that caused IO exit to userspace. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 93 +++++++++++++++++++++++++---------------- 2 files changed, 57 insertions(+), 37 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fc0e752e756..64adb6117e1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -414,6 +414,7 @@ struct kvm_vcpu_arch { struct x86_emulate_ctxt emulate_ctxt; bool emulate_regs_need_sync_to_vcpu; bool emulate_regs_need_sync_from_vcpu; + int (*complete_userspace_io)(struct kvm_vcpu *vcpu); gpa_t time; struct pvclock_vcpu_time_info hv_clock; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f6812ec841..f91e2c9d7cb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4544,6 +4544,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, return true; } +static int complete_emulated_mmio(struct kvm_vcpu *vcpu); +static int complete_emulated_pio(struct kvm_vcpu *vcpu); + int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, @@ -4614,13 +4617,16 @@ restart: } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; - else + else { writeback = false; + vcpu->arch.complete_userspace_io = complete_emulated_pio; + } r = EMULATE_DO_MMIO; } else if (vcpu->mmio_needed) { if (!vcpu->mmio_is_write) writeback = false; r = EMULATE_DO_MMIO; + vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; else @@ -5476,6 +5482,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return r; } +static inline int complete_emulated_io(struct kvm_vcpu *vcpu) +{ + int r; + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r != EMULATE_DONE) + return 0; + return 1; +} + +static int complete_emulated_pio(struct kvm_vcpu *vcpu) +{ + BUG_ON(!vcpu->arch.pio.count); + + return complete_emulated_io(vcpu); +} + /* * Implements the following, as a state machine: * @@ -5492,47 +5516,37 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) * copy data * exit */ -static int complete_mmio(struct kvm_vcpu *vcpu) +static int complete_emulated_mmio(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; struct kvm_mmio_fragment *frag; - int r; - if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) - return 1; + BUG_ON(!vcpu->mmio_needed); - if (vcpu->mmio_needed) { - /* Complete previous fragment */ - frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; - if (!vcpu->mmio_is_write) - memcpy(frag->data, run->mmio.data, frag->len); - if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { - vcpu->mmio_needed = 0; - if (vcpu->mmio_is_write) - return 1; - vcpu->mmio_read_completed = 1; - goto done; - } - /* Initiate next fragment */ - 
++frag; - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = frag->gpa; + /* Complete previous fragment */ + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; + if (!vcpu->mmio_is_write) + memcpy(frag->data, run->mmio.data, frag->len); + if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { + vcpu->mmio_needed = 0; if (vcpu->mmio_is_write) - memcpy(run->mmio.data, frag->data, frag->len); - run->mmio.len = frag->len; - run->mmio.is_write = vcpu->mmio_is_write; - return 0; - - } -done: - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) - return 0; - return 1; + return 1; + vcpu->mmio_read_completed = 1; + return complete_emulated_io(vcpu); + } + /* Initiate next fragment */ + ++frag; + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = frag->gpa; + if (vcpu->mmio_is_write) + memcpy(run->mmio.data, frag->data, frag->len); + run->mmio.len = frag->len; + run->mmio.is_write = vcpu->mmio_is_write; + vcpu->arch.complete_userspace_io = complete_emulated_mmio; + return 0; } + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -5559,9 +5573,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } - r = complete_mmio(vcpu); - if (r <= 0) - goto out; + if (unlikely(vcpu->arch.complete_userspace_io)) { + int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; + vcpu->arch.complete_userspace_io = NULL; + r = cui(vcpu); + if (r <= 0) + goto out; + } else + WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); r = __vcpu_run(vcpu); -- cgit v1.2.3 From 9d1b39a967871b7c69025dba7b7bdaee42871021 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 3 Sep 2012 15:24:27 +0300 Subject: KVM: emulator: make x86 emulation modes enum instead of defines Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 22 ++++++++++------------ arch/x86/kvm/emulate.c | 4 +++- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b5bb73aecc0..e9e5675c0df 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -249,6 +249,15 @@ struct read_cache { unsigned long end; }; +/* Execution mode, passed to the emulator. */ +enum x86emul_mode { + X86EMUL_MODE_REAL, /* Real mode. */ + X86EMUL_MODE_VM86, /* Virtual 8086 mode. */ + X86EMUL_MODE_PROT16, /* 16-bit protected mode. */ + X86EMUL_MODE_PROT32, /* 32-bit protected mode. */ + X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */ +}; + struct x86_emulate_ctxt { const struct x86_emulate_ops *ops; @@ -256,7 +265,7 @@ struct x86_emulate_ctxt { unsigned long eflags; unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ - int mode; + enum x86emul_mode mode; /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; @@ -308,17 +317,6 @@ struct x86_emulate_ctxt { #define REPE_PREFIX 0xf3 #define REPNE_PREFIX 0xf2 -/* Execution mode, passed to the emulator. */ -#define X86EMUL_MODE_REAL 0 /* Real mode. */ -#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ -#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ -#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ -#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. 
*/ - -/* any protected mode */ -#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \ - X86EMUL_MODE_PROT64) - /* CPUID vendors */ #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 663e95881bd..5fe06a8fbeb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2268,6 +2268,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if (msr_data == 0x0) return emulate_gp(ctxt, 0); break; + default: + break; } ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -4400,7 +4402,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } /* Instruction can only be executed in protected mode */ - if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { + if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { rc = emulate_ud(ctxt); goto done; } -- cgit v1.2.3 From f3bd64c68a8f1245e3d037f70c6936cd7bb1196b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 3 Sep 2012 15:24:28 +0300 Subject: KVM: emulator: string_addr_inc() cleanup Remove unneeded segment argument. Address structure already has correct segment which was put there during decode. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5fe06a8fbeb..415f903facd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2790,14 +2790,13 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } -static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, - int reg, struct operand *op) +static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, + struct operand *op) { int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes); op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg)); - op->addr.mem.seg = seg; } static int em_das(struct x86_emulate_ctxt *ctxt) @@ -4570,12 +4569,10 @@ writeback: ctxt->dst.type = saved_dst_type; if ((ctxt->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override(ctxt), - VCPU_REGS_RSI, &ctxt->src); + string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src); if ((ctxt->d & DstMask) == DstDI) - string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, - &ctxt->dst); + string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst); if (ctxt->rep_prefix && (ctxt->d & String)) { struct read_cache *r = &ctxt->io_read; -- cgit v1.2.3 From b3356bf0dbb34980620f2f7def7d1b9a0d325225 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 3 Sep 2012 15:24:29 +0300 Subject: KVM: emulator: optimize "rep ins" handling Optimize "rep ins" by allowing emulator to write back more than one datum at a time. Introduce new operand type OP_MEM_STR which tells writeback() that dst contains pointer to an array that should be written back as opposite to just one data element. 
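
To illustrate the batching idea in isolation (hypothetical names, plain C, not the emulator's real structures): keeping a per-operand element count lets the write-back copy the whole cached run in one memcpy instead of one element per emulated iteration.

#include <stdio.h>
#include <string.h>

/* Miniature of the operand change: "data" points into the cached pio
 * buffer and "count" says how many elements belong to this write-back. */
struct demo_operand {
	void *data;
	unsigned int bytes;	/* size of one element */
	unsigned int count;	/* number of elements */
};

static void demo_writeback_str(unsigned char *dst, const struct demo_operand *op)
{
	memcpy(dst, op->data, (size_t)op->bytes * op->count);
}

int main(void)
{
	unsigned char cache[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	unsigned char guest[8] = { 0 };
	struct demo_operand op = { .data = cache, .bytes = 2, .count = 4 };

	demo_writeback_str(guest, &op);
	printf("last byte: %u\n", guest[7]);	/* 8 */
	return 0;
}
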
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 4 +++- arch/x86/kvm/emulate.c | 33 ++++++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9e5675c0df..15f960c06ff 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -213,8 +213,9 @@ typedef u32 __attribute__((vector_size(16))) sse128_t; /* Type, address-of, and value of an instruction's operand. */ struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; + enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; unsigned int bytes; + unsigned int count; union { unsigned long orig_val; u64 orig_val64; @@ -234,6 +235,7 @@ struct operand { char valptr[sizeof(unsigned long) + 2]; sse128_t vec_val; u64 mm_val; + void *data; }; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 415f903facd..39171cb307e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1301,8 +1301,15 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, rc->end = n * size; } - memcpy(dest, rc->data + rc->pos, size); - rc->pos += size; + if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) { + ctxt->dst.data = rc->data + rc->pos; + ctxt->dst.type = OP_MEM_STR; + ctxt->dst.count = (rc->end - rc->pos) / size; + rc->pos = rc->end; + } else { + memcpy(dest, rc->data + rc->pos, size); + rc->pos += size; + } return 1; } @@ -1546,6 +1553,14 @@ static int writeback(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; break; + case OP_MEM_STR: + rc = segmented_write(ctxt, + ctxt->dst.addr.mem, + ctxt->dst.data, + ctxt->dst.bytes * ctxt->dst.count); + if (rc != X86EMUL_CONTINUE) + return rc; + break; case OP_XMM: write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); break; @@ -2793,7 +2808,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, struct operand *op) { - int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; + int df = (ctxt->eflags & EFLG_DF) ? 
-op->count : op->count; register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes); op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg)); @@ -3733,7 +3748,7 @@ static const struct opcode opcode_table[256] = { I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */ + I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte)), @@ -3991,6 +4006,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI)); op->addr.mem.seg = VCPU_SREG_ES; op->val = 0; + op->count = 1; break; case OpDX: op->type = OP_REG; @@ -4034,6 +4050,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); op->addr.mem.seg = seg_override(ctxt); op->val = 0; + op->count = 1; break; case OpImmFAddr: op->type = OP_IMM; @@ -4575,8 +4592,14 @@ writeback: string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst); if (ctxt->rep_prefix && (ctxt->d & String)) { + unsigned int count; struct read_cache *r = &ctxt->io_read; - register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); + if ((ctxt->d & SrcMask) == SrcSI) + count = ctxt->src.count; + else + count = ctxt->dst.count; + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), + -count); if (!string_insn_completed(ctxt)) { /* -- cgit v1.2.3 From a50abc3b2b469ee80bc0f9ef5b6d457ef72659a9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 5 Sep 2012 20:00:52 +0300 Subject: KVM: use symbolic constant for nr interrupts interrupt_bitmap is KVM_NR_INTERRUPTS bits in size, so just use that instead of hard-coded constants and math. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f91e2c9d7cb..c4d451ed157 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2366,7 +2366,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - if (irq->irq < 0 || irq->irq >= 256) + if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; if (irqchip_in_kernel(vcpu->kvm)) return -ENXIO; @@ -5793,7 +5793,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - max_bits = (sizeof sregs->interrupt_bitmap) << 3; + max_bits = KVM_NR_INTERRUPTS; pending_vec = find_first_bit( (const unsigned long *)sregs->interrupt_bitmap, max_bits); if (pending_vec < max_bits) { -- cgit v1.2.3 From 92b5265d38f6a4d33e9d43974f176f18547687d6 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Mon, 10 Sep 2012 06:55:39 +0800 Subject: KVM: Depend on HIGH_RES_TIMERS KVM lapic timer and tsc deadline timer based on hrtimer, setting a leftmost node to rb tree and then do hrtimer reprogram. If hrtimer not configured as high resolution, hrtimer_enqueue_reprogram do nothing and then make kvm lapic timer and tsc deadline timer fail. 
Signed-off-by: Liu, Jinsong Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 45c044f0fff..586f0005980 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -20,6 +20,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + depends on HIGH_RES_TIMERS # for device assignment: depends on PCI # for TASKSTATS/TASK_DELAY_ACCT: -- cgit v1.2.3 From 7de5bdc96c372ab875408c86e0099958dba89f56 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Sep 2012 14:15:03 +0800 Subject: KVM: MMU: remove unnecessary check Checking the return of kvm_mmu_get_page is unnecessary since it is guaranteed by memory cache Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 399c177212b..aa0b469ee07 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2616,11 +2616,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, iterator.level - 1, 1, ACC_ALL, iterator.sptep); - if (!sp) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(pfn); - return -ENOMEM; - } mmu_spte_set(iterator.sptep, __pa(sp->spt) -- cgit v1.2.3 From ecba9a52acdf20530d561b7634b80c35c308943a Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 5 Sep 2012 19:30:01 +0900 Subject: KVM: x86: lapic: Clean up find_highest_vector() and count_vectors() find_highest_vector() and count_vectors(): - Instead of using magic values, define and use proper macros. find_highest_vector(): - Remove likely() which is there only for historical reasons and not doing correct branch predictions anymore. Using such heuristics to optimize this function is not worth it now. Let CPUs predict things instead. - Stop checking word[0] separately. This was only needed for doing likely() optimization. - Use for loop, not while, to iterate over the register array to make the code clearer. Note that we actually confirmed that the likely() did wrong predictions by inserting debug code. Acked-by: Michael S. 
Tsirkin Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 07ad628dadb..6f9fd633c88 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -66,6 +66,7 @@ #define APIC_DEST_NOSHORT 0x0 #define APIC_DEST_MASK 0x800 #define MAX_APIC_VECTOR 256 +#define APIC_VECTORS_PER_REG 32 #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) @@ -208,25 +209,30 @@ static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = { static int find_highest_vector(void *bitmap) { - u32 *word = bitmap; - int word_offset = MAX_APIC_VECTOR >> 5; + int vec; + u32 *reg; - while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) - continue; + for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; + vec >= 0; vec -= APIC_VECTORS_PER_REG) { + reg = bitmap + REG_POS(vec); + if (*reg) + return fls(*reg) - 1 + vec; + } - if (likely(!word_offset && !word[0])) - return -1; - else - return fls(word[word_offset << 2]) - 1 + (word_offset << 5); + return -1; } static u8 count_vectors(void *bitmap) { - u32 *word = bitmap; - int word_offset; + int vec; + u32 *reg; u8 count = 0; - for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) - count += hweight32(word[word_offset << 2]); + + for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) { + reg = bitmap + REG_POS(vec); + count += hweight32(*reg); + } + return count; } -- cgit v1.2.3 From 83287ea420ced7242a704488aab0fcdcf2ced9ab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:10:57 +0300 Subject: KVM: VMX: Make lto-friendly LTO (link-time optimization) doesn't like local labels to be referred to from a different function, since the two functions may be built in separate compilation units. Use an external variable instead. 
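
A standalone sketch of the same trick (hypothetical symbol name, assumes x86-64 GCC; because it stores an absolute pointer in .rodata it may need -no-pie to link on toolchains that default to PIE): the address of an asm label is published through a real global symbol that other code can reference, instead of naming a compiler-local label from a different translation unit.

#include <stdio.h>

/* demo_return is defined from inside the asm below, mirroring the way
 * the patch publishes vmx_return: a named symbol in .rodata holding the
 * address of what used to be a compiler-local label. */
extern const unsigned long demo_return;

static void demo_define_return(void)
{
	asm volatile("1:\n\t"
		     ".pushsection .rodata\n\t"
		     ".globl demo_return\n\t"
		     "demo_return: .quad 1b\n\t"
		     ".popsection");
}

int main(void)
{
	demo_define_return();
	printf("label lives at %#lx\n", demo_return);
	return 0;
}
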
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d62b4139a29..5faf12ace54 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -127,6 +127,8 @@ module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); +extern const ulong vmx_return; + #define NR_AUTOLOAD_MSRS 8 #define VMCS02_POOL_SIZE 1 @@ -3724,8 +3726,7 @@ static void vmx_set_constant_host_state(void) native_store_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); - vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */ + vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); @@ -6276,11 +6277,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ /* Enter guest mode */ - "jne .Llaunched \n\t" + "jne 1f \n\t" __ex(ASM_VMX_VMLAUNCH) "\n\t" - "jmp .Lkvm_vmx_return \n\t" - ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" - ".Lkvm_vmx_return: " + "jmp 2f \n\t" + "1: " __ex(ASM_VMX_VMRESUME) "\n\t" + "2: " /* Save guest registers, load host registers, keep flags */ "mov %0, %c[wordsize](%%"R"sp) \n\t" "pop %0 \n\t" @@ -6306,6 +6307,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" + ".pushsection .rodata \n\t" + ".global vmx_return \n\t" + "vmx_return: " _ASM_PTR " 2b \n\t" + ".popsection" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), -- cgit v1.2.3 From b188c81f2e1a188ddda6a3d353e5b546c30a9b90 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:10:58 +0300 Subject: KVM: VMX: Make use of asm.h Use macros for bitness-insensitive register names, instead of rolling our own. 
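
A minimal standalone illustration of the idea behind such macros (hypothetical DEMO_ASM_* names, not the kernel's asm.h): the full register name is selected once per build, so the asm strings no longer paste an "r"/"e" prefix by hand.

#include <stdio.h>

#ifdef __x86_64__
# define DEMO_ASM_SP "rsp"
#else
# define DEMO_ASM_SP "esp"
#endif

static unsigned long demo_read_sp(void)
{
	unsigned long sp;

	/* Expands to "mov %%rsp, %0" or "mov %%esp, %0" depending on bitness. */
	asm("mov %%" DEMO_ASM_SP ", %0" : "=r"(sp));
	return sp;
}

int main(void)
{
	printf("stack pointer near %#lx\n", demo_read_sp());
	return 0;
}
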
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 69 ++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 39 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5faf12ace54..30bcb953afe 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6184,14 +6184,6 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } -#ifdef CONFIG_X86_64 -#define R "r" -#define Q "q" -#else -#define R "e" -#define Q "l" -#endif - static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6240,30 +6232,30 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ - "push %%"R"dx; push %%"R"bp;" - "push %%"R"cx \n\t" /* placeholder for guest rcx */ - "push %%"R"cx \n\t" - "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "push %%" _ASM_DX "; push %%" _ASM_BP ";" + "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ + "push %%" _ASM_CX " \n\t" + "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" "je 1f \n\t" - "mov %%"R"sp, %c[host_rsp](%0) \n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" "1: \n\t" /* Reload cr2 if changed */ - "mov %c[cr2](%0), %%"R"ax \n\t" - "mov %%cr2, %%"R"dx \n\t" - "cmp %%"R"ax, %%"R"dx \n\t" + "mov %c[cr2](%0), %%" _ASM_AX " \n\t" + "mov %%cr2, %%" _ASM_DX " \n\t" + "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" "je 2f \n\t" - "mov %%"R"ax, %%cr2 \n\t" + "mov %%" _ASM_AX", %%cr2 \n\t" "2: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ - "mov %c[rax](%0), %%"R"ax \n\t" - "mov %c[rbx](%0), %%"R"bx \n\t" - "mov %c[rdx](%0), %%"R"dx \n\t" - "mov %c[rsi](%0), %%"R"si \n\t" - "mov %c[rdi](%0), %%"R"di \n\t" - "mov %c[rbp](%0), %%"R"bp \n\t" + "mov %c[rax](%0), %%" _ASM_AX " \n\t" + "mov %c[rbx](%0), %%" _ASM_BX " \n\t" + "mov %c[rdx](%0), %%" _ASM_DX " \n\t" + "mov %c[rsi](%0), %%" _ASM_SI " \n\t" + "mov %c[rdi](%0), %%" _ASM_DI " \n\t" + "mov %c[rbp](%0), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 "mov %c[r8](%0), %%r8 \n\t" "mov %c[r9](%0), %%r9 \n\t" @@ -6274,7 +6266,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %c[r14](%0), %%r14 \n\t" "mov %c[r15](%0), %%r15 \n\t" #endif - "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ /* Enter guest mode */ "jne 1f \n\t" @@ -6283,15 +6275,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "1: " __ex(ASM_VMX_VMRESUME) "\n\t" "2: " /* Save guest registers, load host registers, keep flags */ - "mov %0, %c[wordsize](%%"R"sp) \n\t" + "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" "pop %0 \n\t" - "mov %%"R"ax, %c[rax](%0) \n\t" - "mov %%"R"bx, %c[rbx](%0) \n\t" - "pop"Q" %c[rcx](%0) \n\t" - "mov %%"R"dx, %c[rdx](%0) \n\t" - "mov %%"R"si, %c[rsi](%0) \n\t" - "mov %%"R"di, %c[rdi](%0) \n\t" - "mov %%"R"bp, %c[rbp](%0) \n\t" + "mov %%" _ASM_AX ", %c[rax](%0) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" + __ASM_SIZE(pop) " %c[rcx](%0) \n\t" + "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" + "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" + "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" + "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" #ifdef CONFIG_X86_64 "mov %%r8, %c[r8](%0) \n\t" "mov %%r9, %c[r9](%0) \n\t" @@ -6302,10 +6294,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" 
#endif - "mov %%cr2, %%"R"ax \n\t" - "mov %%"R"ax, %c[cr2](%0) \n\t" + "mov %%cr2, %%" _ASM_AX " \n\t" + "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" - "pop %%"R"bp; pop %%"R"dx \n\t" + "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" "setbe %c[fail](%0) \n\t" ".pushsection .rodata \n\t" ".global vmx_return \n\t" @@ -6335,9 +6327,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" - , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 + , "rax", "rbx", "rdi", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#else + , "eax", "ebx", "edi", "esi" #endif ); @@ -6389,9 +6383,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } -#undef R -#undef Q - static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); -- cgit v1.2.3 From 7454766f7bead388251aedee35a478356a7f4e72 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:10:59 +0300 Subject: KVM: SVM: Make use of asm.h Use macros for bitness-insensitive register names, instead of rolling our own. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 611c72875fb..818fceb3091 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3782,12 +3782,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } -#ifdef CONFIG_X86_64 -#define R "r" -#else -#define R "e" -#endif - static void svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3814,13 +3808,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); asm volatile ( - "push %%"R"bp; \n\t" - "mov %c[rbx](%[svm]), %%"R"bx \n\t" - "mov %c[rcx](%[svm]), %%"R"cx \n\t" - "mov %c[rdx](%[svm]), %%"R"dx \n\t" - "mov %c[rsi](%[svm]), %%"R"si \n\t" - "mov %c[rdi](%[svm]), %%"R"di \n\t" - "mov %c[rbp](%[svm]), %%"R"bp \n\t" + "push %%" _ASM_BP "; \n\t" + "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" + "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t" + "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t" + "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t" + "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t" + "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 "mov %c[r8](%[svm]), %%r8 \n\t" "mov %c[r9](%[svm]), %%r9 \n\t" @@ -3833,20 +3827,20 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif /* Enter guest mode */ - "push %%"R"ax \n\t" - "mov %c[vmcb](%[svm]), %%"R"ax \n\t" + "push %%" _ASM_AX " \n\t" + "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" __ex(SVM_VMLOAD) "\n\t" __ex(SVM_VMRUN) "\n\t" __ex(SVM_VMSAVE) "\n\t" - "pop %%"R"ax \n\t" + "pop %%" _ASM_AX " \n\t" /* Save guest registers, load host registers */ - "mov %%"R"bx, %c[rbx](%[svm]) \n\t" - "mov %%"R"cx, %c[rcx](%[svm]) \n\t" - "mov %%"R"dx, %c[rdx](%[svm]) \n\t" - "mov %%"R"si, %c[rsi](%[svm]) \n\t" - "mov %%"R"di, %c[rdi](%[svm]) \n\t" - "mov %%"R"bp, %c[rbp](%[svm]) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t" + "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t" + "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t" + "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t" + "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t" + "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t" #ifdef CONFIG_X86_64 "mov %%r8, %c[r8](%[svm]) \n\t" "mov %%r9, %c[r9](%[svm]) \n\t" @@ -3857,7 +3851,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r14, 
%c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" #endif - "pop %%"R"bp" + "pop %%" _ASM_BP : : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), @@ -3878,9 +3872,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) #endif : "cc", "memory" - , R"bx", R"cx", R"dx", R"si", R"di" #ifdef CONFIG_X86_64 + , "rbx", "rcx", "rdx", "rsi", "rdi" , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" +#else + , "ebx", "ecx", "edx", "esi", "edi" #endif ); @@ -3940,8 +3936,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) mark_all_clean(svm->vmcb); } -#undef R - static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); -- cgit v1.2.3 From 9fc77441e5e1bf80b794cc546d2243ee9f4afb75 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 16 Sep 2012 11:50:30 +0300 Subject: KVM: make processes waiting on vcpu mutex killable vcpu mutex can be held for unlimited time so taking it with mutex_lock on an ioctl is wrong: one process could be passed a vcpu fd and call this ioctl on the vcpu used by another process, it will then be unkillable until the owner exits. Call mutex_lock_killable instead and return status. Note: mutex_lock_interruptible would be even nicer, but I am not sure all users are prepared to handle EINTR from these ioctls. They might misinterpret it as an error. Cleanup paths expect a vcpu that can't be used by any userspace so this will always succeed - catch bugs by calling BUG_ON. Catch callers that don't check return state by adding __must_check. Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c4d451ed157..19047eafa38 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6016,7 +6016,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) int r; vcpu->arch.mtrr_state.have_fixed = 1; - vcpu_load(vcpu); + r = vcpu_load(vcpu); + if (r) + return r; r = kvm_arch_vcpu_reset(vcpu); if (r == 0) r = kvm_mmu_setup(vcpu); @@ -6027,9 +6029,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + int r; vcpu->arch.apf.msr_val = 0; - vcpu_load(vcpu); + r = vcpu_load(vcpu); + BUG_ON(r); kvm_mmu_unload(vcpu); vcpu_put(vcpu); @@ -6275,7 +6279,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); + int r; + r = vcpu_load(vcpu); + BUG_ON(r); kvm_mmu_unload(vcpu); vcpu_put(vcpu); } -- cgit v1.2.3 From 8ea667f259e3767fd3ee85a885c14e417835695e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 13:44:53 +0300 Subject: KVM: MMU: Push clean gpte write protection out of gpte_access() gpte_access() computes the access permissions of a guest pte and also write-protects clean gptes. This is wrong when we are servicing a write fault (since we'll be setting the dirty bit momentarily) but correct when instantiating a speculative spte, or when servicing a read fault (since we'll want to trap a following write in order to set the dirty bit). It doesn't seem to hurt in practice, but in order to make the code readable, push the write protection out of gpte_access() and into a new protect_clean_gpte() which is called explicitly when needed. 
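
The new helper relies on a small bit trick worth spelling out: shifting the gpte right by (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT) moves the dirty bit (bit 6 of an x86 PTE) onto the writable permission bit (bit 1), so write access survives only when the gpte is already dirty. A standalone model of that masking (demo names, not the kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_WRITE_MASK		(1u << 1)	/* x86 PTE R/W bit */
#define DEMO_DIRTY_SHIFT	6
#define DEMO_WRITE_SHIFT	1

static unsigned int demo_protect_clean_gpte(unsigned int access, uint64_t gpte)
{
	unsigned int mask = ~DEMO_WRITE_MASK;

	/* Re-allow write access only if the gpte's dirty bit is set. */
	mask |= (gpte >> (DEMO_DIRTY_SHIFT - DEMO_WRITE_SHIFT)) & DEMO_WRITE_MASK;
	return access & mask;
}

int main(void)
{
	assert(demo_protect_clean_gpte(DEMO_WRITE_MASK, 1ull << 6) == DEMO_WRITE_MASK);
	assert(demo_protect_clean_gpte(DEMO_WRITE_MASK, 0) == 0);
	printf("clean gptes lose write access, dirty gptes keep it\n");
	return 0;
}
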
Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 12 ++++++++++++ arch/x86/kvm/mmu.h | 3 ++- arch/x86/kvm/paging_tmpl.h | 24 ++++++++++++------------ 3 files changed, 26 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa0b469ee07..54c9cb4fdfa 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3408,6 +3408,18 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; } +static inline void protect_clean_gpte(unsigned *access, unsigned gpte) +{ + unsigned mask; + + BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); + + mask = (unsigned)~ACC_WRITE_MASK; + /* Allow write access to dirty gptes */ + mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK; + *access &= mask; +} + static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e374db9af02..2832081e9b2 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -18,7 +18,8 @@ #define PT_PCD_MASK (1ULL << 4) #define PT_ACCESSED_SHIFT 5 #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) -#define PT_DIRTY_MASK (1ULL << 6) +#define PT_DIRTY_SHIFT 6 +#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT) #define PT_PAGE_SIZE_MASK (1ULL << 7) #define PT_PAT_MASK (1ULL << 7) #define PT_GLOBAL_MASK (1ULL << 8) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf8c42bf50f..bf7b4ffafab 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -101,14 +101,11 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, - bool last) +static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) { unsigned access; access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - if (last && !is_dirty_gpte(gpte)) - access &= ~ACC_WRITE_MASK; #if PTTYPE == 64 if (vcpu->arch.mmu.nx) @@ -222,8 +219,7 @@ retry_walk: last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); if (last_gpte) { - pte_access = pt_access & - FNAME(gpte_access)(vcpu, pte, true); + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); /* check if the kernel is fetching from user page */ if (unlikely(pte_access & PT_USER_MASK) && kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) @@ -274,7 +270,7 @@ retry_walk: break; } - pt_access &= FNAME(gpte_access)(vcpu, pte, false); + pt_access &= FNAME(gpte_access)(vcpu, pte); --walker->level; } @@ -283,7 +279,9 @@ retry_walk: goto error; } - if (write_fault && unlikely(!is_dirty_gpte(pte))) { + if (!write_fault) + protect_clean_gpte(&pte_access, pte); + else if (unlikely(!is_dirty_gpte(pte))) { int ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); @@ -368,7 +366,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + protect_clean_gpte(&pte_access, gpte); pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); if (mmu_invalid_pfn(pfn)) return; @@ -441,8 +440,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) continue; - pte_access = sp->role.access & 
FNAME(gpte_access)(vcpu, gpte, - true); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + protect_clean_gpte(&pte_access, gpte); gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, pte_access & ACC_WRITE_MASK); @@ -794,7 +793,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte, true); + pte_access &= FNAME(gpte_access)(vcpu, gpte); + protect_clean_gpte(&pte_access, gpte); if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) continue; -- cgit v1.2.3 From edc2ae84eb40a3c062210fe01af1cae1633cc810 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 13:53:08 +0300 Subject: KVM: MMU: Optimize gpte_access() slightly If nx is disabled, then is gpte[63] is set we will hit a reserved bit set fault before checking permissions; so we can ignore the setting of efer.nxe. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf7b4ffafab..064bcb32d84 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -106,10 +106,8 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) unsigned access; access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - #if PTTYPE == 64 - if (vcpu->arch.mmu.nx) - access &= ~(gpte >> PT64_NX_SHIFT); + access &= ~(gpte >> PT64_NX_SHIFT); #endif return access; } -- cgit v1.2.3 From 3d34adec7081621ff51c195be045b87d75c0c49d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 14:03:28 +0300 Subject: KVM: MMU: Move gpte_access() out of paging_tmpl.h We no longer rely on paging_tmpl.h defines; so we can move the function to mmu.c. Rely on zero extension to 64 bits to get the correct nx behaviour. 
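To see why plain zero extension gives the right nx behaviour, a small stand-alone sketch; the masks are simplified stand-ins for KVM's ACC_*/PT_* definitions and the vcpu argument is dropped:

#include <assert.h>
#include <stdint.h>

#define ACC_EXEC_MASK     1u
#define PT_WRITABLE_MASK  (1u << 1)
#define PT_USER_MASK      (1u << 2)
#define PT64_NX_SHIFT     63

static unsigned gpte_access(uint64_t gpte)
{
	unsigned access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;

	/* gpte >> 63 is 1 only when the NX bit is set; a 32-bit pte
	 * zero-extended into a u64 can never have bit 63 set, so its exec
	 * permission is never stripped, exactly as the old
	 * "#if PTTYPE == 64" guard guaranteed. */
	access &= ~(unsigned)(gpte >> PT64_NX_SHIFT);
	return access;
}

int main(void)
{
	uint64_t nx_pte = (1ULL << PT64_NX_SHIFT) | PT_WRITABLE_MASK;
	uint64_t pte32  = (uint32_t)0xfffff003u;	/* a 32-bit pte, zero-extended */

	assert(!(gpte_access(nx_pte) & ACC_EXEC_MASK));	/* NX strips exec */
	assert(gpte_access(pte32) & ACC_EXEC_MASK);	/* 32-bit ptes keep exec */
	return 0;
}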
Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 ++++++++++ arch/x86/kvm/paging_tmpl.h | 21 +++++---------------- 2 files changed, 15 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 54c9cb4fdfa..f297a2ccf4f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3437,6 +3437,16 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, return false; } +static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned access; + + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + access &= ~(gpte >> PT64_NX_SHIFT); + + return access; +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 064bcb32d84..1cbf576852c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -101,17 +101,6 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) -{ - unsigned access; - - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; -#if PTTYPE == 64 - access &= ~(gpte >> PT64_NX_SHIFT); -#endif - return access; -} - static bool FNAME(is_last_gpte)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t gpte) @@ -217,7 +206,7 @@ retry_walk: last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); if (last_gpte) { - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); + pte_access = pt_access & gpte_access(vcpu, pte); /* check if the kernel is fetching from user page */ if (unlikely(pte_access & PT_USER_MASK) && kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) @@ -268,7 +257,7 @@ retry_walk: break; } - pt_access &= FNAME(gpte_access)(vcpu, pte); + pt_access &= gpte_access(vcpu, pte); --walker->level; } @@ -364,7 +353,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); if (mmu_invalid_pfn(pfn)) @@ -438,7 +427,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) continue; - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, @@ -791,7 +780,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte); + pte_access &= gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) -- cgit v1.2.3 From 8cbc70696f149e44753b0fe60162b4ff96c2dd2b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 14:18:51 +0300 Subject: KVM: MMU: Update accessed and dirty bits after guest pagetable walk While unspecified, the behaviour of Intel processors is to first perform the page table walk, then, if the walk was successful, to atomically update the accessed and dirty bits of walked paging elements. 
While we are not required to follow this exactly, doing so will allow us to perform the access permissions check after the walk is complete, rather than after each walk step. (the tricky case is SMEP: a zero in any pte's U bit makes the referenced page a supervisor page, so we can't fault on a one bit during the walk itself). Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 76 ++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1cbf576852c..35a05dd2f69 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -63,10 +63,12 @@ */ struct guest_walker { int level; + unsigned max_level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; + pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; gfn_t gfn; @@ -119,6 +121,43 @@ static bool FNAME(is_last_gpte)(struct guest_walker *walker, return false; } +static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, + struct guest_walker *walker, + int write_fault) +{ + unsigned level, index; + pt_element_t pte, orig_pte; + pt_element_t __user *ptep_user; + gfn_t table_gfn; + int ret; + + for (level = walker->max_level; level >= walker->level; --level) { + pte = orig_pte = walker->ptes[level - 1]; + table_gfn = walker->table_gfn[level - 1]; + ptep_user = walker->ptep_user[level - 1]; + index = offset_in_page(ptep_user) / sizeof(pt_element_t); + if (!(pte & PT_ACCESSED_MASK)) { + trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); + pte |= PT_ACCESSED_MASK; + } + if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { + trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); + pte |= PT_DIRTY_MASK; + } + if (pte == orig_pte) + continue; + + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); + if (ret) + return ret; + + mark_page_dirty(vcpu->kvm, table_gfn); + walker->ptes[level] = pte; + } + return 0; +} + /* * Fetch a guest pte for a guest virtual address */ @@ -126,6 +165,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gva_t addr, u32 access) { + int ret; pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; @@ -153,6 +193,7 @@ retry_walk: --walker->level; } #endif + walker->max_level = walker->level; ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); @@ -183,6 +224,7 @@ retry_walk: ptep_user = (pt_element_t __user *)((void *)host_addr + offset); if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) goto error; + walker->ptep_user[walker->level - 1] = ptep_user; trace_kvm_mmu_paging_element(pte, walker->level); @@ -214,21 +256,6 @@ retry_walk: eperm = true; } - if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { - int ret; - trace_kvm_mmu_set_accessed_bit(table_gfn, index, - sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, - pte, pte|PT_ACCESSED_MASK); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; - - mark_page_dirty(vcpu->kvm, table_gfn); - pte |= PT_ACCESSED_MASK; - } - walker->ptes[walker->level - 1] = pte; if (last_gpte) { @@ -268,21 +295,12 @@ retry_walk: if (!write_fault) protect_clean_gpte(&pte_access, pte); - else if 
(unlikely(!is_dirty_gpte(pte))) { - int ret; - trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, - pte, pte|PT_DIRTY_MASK); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; - - mark_page_dirty(vcpu->kvm, table_gfn); - pte |= PT_DIRTY_MASK; - walker->ptes[walker->level - 1] = pte; - } + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); + if (unlikely(ret < 0)) + goto error; + else if (ret) + goto retry_walk; walker->pt_access = pt_access; walker->pte_access = pte_access; -- cgit v1.2.3 From 97d64b788114be1c4dc4bfe7a8ba2bf9643fe6af Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 14:52:00 +0300 Subject: KVM: MMU: Optimize pte permission checks walk_addr_generic() permission checks are a maze of branchy code, which is performed four times per lookup. It depends on the type of access, efer.nxe, cr0.wp, cr4.smep, and in the near future, cr4.smap. Optimize this away by precalculating all variants and storing them in a bitmap. The bitmap is recalculated when rarely-changing variables change (cr0, cr4) and is indexed by the often-changing variables (page fault error code, pte access permissions). The permission check is moved to the end of the loop, otherwise an SMEP fault could be reported as a false positive, when PDE.U=1 but PTE.U=0. Noted by Xiao Guangrong. The result is short, branch-free code. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/mmu.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu.h | 19 ++++++++----------- arch/x86/kvm/paging_tmpl.h | 22 ++++------------------ arch/x86/kvm/x86.c | 11 ++++------- 5 files changed, 61 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 64adb6117e1..3318bde206a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -287,6 +287,13 @@ struct kvm_mmu { union kvm_mmu_page_role base_role; bool direct_map; + /* + * Bitmap; bit set = permission fault + * Byte index: page fault error code [4:1] + * Bit index: pte permissions in ACC_* format + */ + u8 permissions[16]; + u64 *pae_root; u64 *lm_root; u64 rsvd_bits_mask[2][4]; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f297a2ccf4f..9c6188931f8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3516,6 +3516,38 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, } } +static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +{ + unsigned bit, byte, pfec; + u8 map; + bool fault, x, w, u, wf, uf, ff, smep; + + smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { + pfec = byte << 1; + map = 0; + wf = pfec & PFERR_WRITE_MASK; + uf = pfec & PFERR_USER_MASK; + ff = pfec & PFERR_FETCH_MASK; + for (bit = 0; bit < 8; ++bit) { + x = bit & ACC_EXEC_MASK; + w = bit & ACC_WRITE_MASK; + u = bit & ACC_USER_MASK; + + /* Not really needed: !nx will cause pte.nx to fault */ + x |= !mmu->nx; + /* Allow supervisor writes if !cr0.wp */ + w |= !is_write_protection(vcpu) && !uf; + /* Disallow supervisor fetches of user code if cr4.smep */ + x &= !(smep && u && !uf); + + fault = (ff && !x) || (uf && !u) || (wf && !w); + map |= fault << bit; + } + mmu->permissions[byte] = map; + } +} + static int paging64_init_context_common(struct kvm_vcpu *vcpu, struct kvm_mmu *context, int level) @@ -3524,6 +3556,7 @@ 
static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->root_level = level; reset_rsvds_bits_mask(vcpu, context); + update_permission_bitmask(vcpu, context); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -3552,6 +3585,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->root_level = PT32_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, context); + update_permission_bitmask(vcpu, context); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; @@ -3612,6 +3646,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; } + update_permission_bitmask(vcpu, context); + return 0; } @@ -3687,6 +3723,8 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } + update_permission_bitmask(vcpu, g_context); + return 0; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 2832081e9b2..584660775d0 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -89,17 +89,14 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } -static inline bool check_write_user_access(struct kvm_vcpu *vcpu, - bool write_fault, bool user_fault, - unsigned long pte) +/* + * Will a fault with a given page-fault error code (pfec) cause a permission + * fault with the given access (in ACC_* format)? + */ +static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, + unsigned pfec) { - if (unlikely(write_fault && !is_writable_pte(pte) - && (user_fault || is_write_protection(vcpu)))) - return false; - - if (unlikely(user_fault && !(pte & PT_USER_MASK))) - return false; - - return true; + return (mmu->permissions[pfec >> 1] >> pte_access) & 1; } + #endif diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 35a05dd2f69..8f6c59fadbb 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -169,7 +169,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, uninitialized_var(pte_access); + unsigned index, pt_access, pte_access; gpa_t pte_gpa; bool eperm, last_gpte; int offset; @@ -237,24 +237,9 @@ retry_walk: goto error; } - if (!check_write_user_access(vcpu, write_fault, user_fault, - pte)) - eperm = true; - -#if PTTYPE == 64 - if (unlikely(fetch_fault && (pte & PT64_NX_MASK))) - eperm = true; -#endif + pte_access = pt_access & gpte_access(vcpu, pte); last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); - if (last_gpte) { - pte_access = pt_access & gpte_access(vcpu, pte); - /* check if the kernel is fetching from user page */ - if (unlikely(pte_access & PT_USER_MASK) && - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - if (fetch_fault && !user_fault) - eperm = true; - } walker->ptes[walker->level - 1] = pte; @@ -284,10 +269,11 @@ retry_walk: break; } - pt_access &= gpte_access(vcpu, pte); + pt_access &= pte_access; --walker->level; } + eperm |= permission_fault(mmu, pte_access, access); if (unlikely(eperm)) { errcode |= PFERR_PRESENT_MASK; goto error; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19047eafa38..497226e49d4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3672,20 +3672,17 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; + u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) + | (write ? PFERR_WRITE_MASK : 0); - if (vcpu_match_mmio_gva(vcpu, gva) && - check_write_user_access(vcpu, write, access, - vcpu->arch.access)) { + if (vcpu_match_mmio_gva(vcpu, gva) + && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); return 1; } - if (write) - access |= PFERR_WRITE_MASK; - *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); if (*gpa == UNMAPPED_GVA) -- cgit v1.2.3 From 13d22b6aebb000aeaf137862c6c0e0c4d138d798 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 15:12:09 +0300 Subject: KVM: MMU: Simplify walk_addr_generic() loop The page table walk is coded as an infinite loop, with a special case on the last pte. Code it as an ordinary loop with a termination condition on the last pte (large page or walk length exhausted), and put the last pte handling code after the loop where it belongs. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 60 +++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8f6c59fadbb..1b4c14d235a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -171,12 +171,15 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, pte_access; gpa_t pte_gpa; - bool eperm, last_gpte; + bool eperm; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; const int fetch_fault = access & PFERR_FETCH_MASK; u16 errcode = 0; + gpa_t real_gpa; + gfn_t gfn; + u32 ac; trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: @@ -197,12 +200,16 @@ retry_walk: ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); - pt_access = ACC_ALL; + pt_access = pte_access = ACC_ALL; + ++walker->level; - for (;;) { + do { gfn_t real_gfn; unsigned long host_addr; + pt_access &= pte_access; + --walker->level; + index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); @@ -239,39 +246,8 @@ retry_walk: pte_access = pt_access & gpte_access(vcpu, pte); - last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); - walker->ptes[walker->level - 1] = pte; - - if (last_gpte) { - int lvl = walker->level; - gpa_t real_gpa; - gfn_t gfn; - u32 ac; - - gfn = gpte_to_gfn_lvl(pte, lvl); - gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; - - if (PTTYPE == 32 && - walker->level == PT_DIRECTORY_LEVEL && - is_cpuid_PSE36()) - gfn += pse36_gfn_delta(pte); - - ac = write_fault | fetch_fault | user_fault; - - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), - ac); - if (real_gpa == UNMAPPED_GVA) - return 0; - - walker->gfn = real_gpa >> PAGE_SHIFT; - - break; - } - - pt_access &= pte_access; - --walker->level; - } + } while (!FNAME(is_last_gpte)(walker, vcpu, mmu, pte)); eperm |= permission_fault(mmu, pte_access, access); if (unlikely(eperm)) { @@ -279,6 +255,20 @@ retry_walk: goto error; } + gfn = gpte_to_gfn_lvl(pte, walker->level); + gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; + + if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) + gfn += pse36_gfn_delta(pte); + + ac = write_fault | fetch_fault | user_fault; + + real_gpa = 
mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), ac); + if (real_gpa == UNMAPPED_GVA) + return 0; + + walker->gfn = real_gpa >> PAGE_SHIFT; + if (!write_fault) protect_clean_gpte(&pte_access, pte); -- cgit v1.2.3 From 6fd01b711bee96ce3356f7b6f370ab708e37504b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 20:46:56 +0300 Subject: KVM: MMU: Optimize is_last_gpte() Instead of branchy code depending on level, gpte.ps, and mmu configuration, prepare everything in a bitmap during mode changes and look it up during runtime. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/mmu.c | 31 +++++++++++++++++++++++++++++++ arch/x86/kvm/mmu.h | 3 ++- arch/x86/kvm/paging_tmpl.h | 20 +------------------- 4 files changed, 41 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3318bde206a..43aeb942283 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -298,6 +298,13 @@ struct kvm_mmu { u64 *lm_root; u64 rsvd_bits_mask[2][4]; + /* + * Bitmap: bit set = last pte in walk + * index[0:1]: level (zero-based) + * index[2]: pte.ps + */ + u8 last_pte_bitmap; + bool nx; u64 pdptrs[4]; /* pae */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9c6188931f8..d289fee1ffb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3447,6 +3447,15 @@ static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) return access; } +static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) +{ + unsigned index; + + index = level - 1; + index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2); + return mmu->last_pte_bitmap & (1 << index); +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -3548,6 +3557,24 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu } } +static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +{ + u8 map; + unsigned level, root_level = mmu->root_level; + const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */ + + if (root_level == PT32E_ROOT_LEVEL) + --root_level; + /* PT_PAGE_TABLE_LEVEL always terminates */ + map = 1 | (1 << ps_set_index); + for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) { + if (level <= PT_PDPE_LEVEL + && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu))) + map |= 1 << (ps_set_index | (level - 1)); + } + mmu->last_pte_bitmap = map; +} + static int paging64_init_context_common(struct kvm_vcpu *vcpu, struct kvm_mmu *context, int level) @@ -3557,6 +3584,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context); + update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -3586,6 +3614,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context); + update_last_pte_bitmap(vcpu, context); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; @@ -3647,6 +3676,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context); + update_last_pte_bitmap(vcpu, context); return 0; } @@ -3724,6 +3754,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context); + update_last_pte_bitmap(vcpu, g_context); return 0; } diff --git a/arch/x86/kvm/mmu.h 
b/arch/x86/kvm/mmu.h index 584660775d0..69871080e86 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -20,7 +20,8 @@ #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) #define PT_DIRTY_SHIFT 6 #define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT) -#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAGE_SIZE_SHIFT 7 +#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT) #define PT_PAT_MASK (1ULL << 7) #define PT_GLOBAL_MASK (1ULL << 8) #define PT64_NX_SHIFT 63 diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1b4c14d235a..134ea7b1c58 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -103,24 +103,6 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } -static bool FNAME(is_last_gpte)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - pt_element_t gpte) -{ - if (walker->level == PT_PAGE_TABLE_LEVEL) - return true; - - if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && - (PTTYPE == 64 || is_pse(vcpu))) - return true; - - if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && - (mmu->root_level == PT64_ROOT_LEVEL)) - return true; - - return false; -} - static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -247,7 +229,7 @@ retry_walk: pte_access = pt_access & gpte_access(vcpu, pte); walker->ptes[walker->level - 1] = pte; - } while (!FNAME(is_last_gpte)(walker, vcpu, mmu, pte)); + } while (!is_last_gpte(mmu, walker->level, pte)); eperm |= permission_fault(mmu, pte_access, access); if (unlikely(eperm)) { -- cgit v1.2.3 From 71331a1da1e3a66d14bb3864f99e32d84ab5a76f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 14:49:15 +0300 Subject: KVM: MMU: Eliminate eperm temporary 'eperm' is no longer used in the walker loop, so we can eliminate it. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 134ea7b1c58..95a64d1dccc 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -153,7 +153,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, pte_access; gpa_t pte_gpa; - bool eperm; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; @@ -165,7 +164,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: - eperm = false; walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); @@ -231,8 +229,7 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - eperm |= permission_fault(mmu, pte_access, access); - if (unlikely(eperm)) { + if (unlikely(permission_fault(mmu, pte_access, access))) { errcode |= PFERR_PRESENT_MASK; goto error; } -- cgit v1.2.3 From b514c30f7729b66af481fde3c1225e832e339d5b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:03:02 +0300 Subject: KVM: MMU: Avoid access/dirty update loop if all is well Keep track of accessed/dirty bits; if they are all set, do not enter the accessed/dirty update loop. 
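A stand-alone sketch of the branch-free fold, reduced to a single pte rather than the bits accumulated over the whole walk; the Accessed/Dirty bit positions and PFERR_WRITE_MASK below follow the x86 definitions, and ilog2() is replaced by __builtin_ctz() so the example builds on its own:

#include <assert.h>

#define PT_ACCESSED_SHIFT 5
#define PT_DIRTY_SHIFT    6
#define PT_ACCESSED_MASK  (1u << PT_ACCESSED_SHIFT)
#define PT_DIRTY_MASK     (1u << PT_DIRTY_SHIFT)
#define PFERR_WRITE_MASK  (1u << 1)

/* Non-zero if the accessed/dirty update loop can be skipped for this pte. */
static unsigned ad_bits_already_set(unsigned pte, unsigned write_fault)
{
	unsigned accessed_dirty = PT_ACCESSED_MASK;
	unsigned shift;

	/* write_fault is either 0 or PFERR_WRITE_MASK, so shift ends up
	 * 0 or 1: a read fault tests the Accessed bit in place, a write
	 * fault tests the Dirty bit one position to its left, with no
	 * branch on the fault type. */
	shift = write_fault >> __builtin_ctz(PFERR_WRITE_MASK);
	shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
	accessed_dirty &= pte >> shift;
	return accessed_dirty;
}

int main(void)
{
	unsigned a_only  = PT_ACCESSED_MASK;
	unsigned a_and_d = PT_ACCESSED_MASK | PT_DIRTY_MASK;

	assert(ad_bits_already_set(a_only, 0));			/* read fault: A set, skip the loop */
	assert(!ad_bits_already_set(a_only, PFERR_WRITE_MASK));	/* write fault: D clear, must update */
	assert(ad_bits_already_set(a_and_d, PFERR_WRITE_MASK));	/* write fault: D set, skip the loop */
	return 0;
}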
Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 95a64d1dccc..810c1da2ee4 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -151,7 +151,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access; + unsigned index, pt_access, pte_access, accessed_dirty, shift; gpa_t pte_gpa; int offset; const int write_fault = access & PFERR_WRITE_MASK; @@ -180,6 +180,7 @@ retry_walk: ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); + accessed_dirty = PT_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; ++walker->level; @@ -224,6 +225,7 @@ retry_walk: goto error; } + accessed_dirty &= pte; pte_access = pt_access & gpte_access(vcpu, pte); walker->ptes[walker->level - 1] = pte; @@ -251,11 +253,23 @@ retry_walk: if (!write_fault) protect_clean_gpte(&pte_access, pte); - ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; + /* + * On a write fault, fold the dirty bit into accessed_dirty by shifting it one + * place right. + * + * On a read fault, do nothing. + */ + shift = write_fault >> ilog2(PFERR_WRITE_MASK); + shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT; + accessed_dirty &= pte >> shift; + + if (unlikely(!accessed_dirty)) { + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); + if (unlikely(ret < 0)) + goto error; + else if (ret) + goto retry_walk; + } walker->pt_access = pt_access; walker->pte_access = pte_access; -- cgit v1.2.3 From c5421519f30bd5ed77857a78de6dc8414385e602 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 19 Sep 2012 19:33:48 +0300 Subject: KVM: MMU: Eliminate pointless temporary 'ac' 'ac' essentially reconstructs the 'access' variable we already have, except for the PFERR_PRESENT_MASK and PFERR_RSVD_MASK. As these are not used by callees, just use 'access' directly. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 810c1da2ee4..714e2c01a6f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -160,7 +160,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, u16 errcode = 0; gpa_t real_gpa; gfn_t gfn; - u32 ac; trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: @@ -242,9 +241,7 @@ retry_walk: if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - ac = write_fault | fetch_fault | user_fault; - - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), ac); + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access); if (real_gpa == UNMAPPED_GVA) return 0; -- cgit v1.2.3 From 1e08ec4a130e2745d96df169e67c58df98a07311 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 13 Sep 2012 17:19:24 +0300 Subject: KVM: optimize apic interrupt delivery Most interrupt are delivered to only one vcpu. Use pre-build tables to find interrupt destination instead of looping through all vcpus. In case of logical mode loop only through vcpus in a logical cluster irq is sent to. 
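A stand-alone sketch of the LDR decoding behind the pre-built logical map; the field widths are the ones this patch installs for xAPIC cluster mode (4-bit cluster id, 4-bit member bitmap) and for x2APIC (16/16), while the RCU-protected map and its locking are left out:

#include <assert.h>
#include <stdint.h>

struct apic_map_geometry {
	uint8_t  ldr_bits;
	uint32_t cid_shift, cid_mask, lid_mask;
};

static uint16_t apic_cluster_id(const struct apic_map_geometry *g, uint32_t ldr)
{
	ldr >>= 32 - g->ldr_bits;		/* xAPIC keeps its logical id in LDR[31:24] */
	return (ldr >> g->cid_shift) & g->cid_mask;
}

static uint16_t apic_logical_id(const struct apic_map_geometry *g, uint32_t ldr)
{
	ldr >>= 32 - g->ldr_bits;
	return ldr & g->lid_mask;		/* bitmap of members inside the cluster */
}

int main(void)
{
	/* xAPIC cluster mode: LDR[31:28] = cluster id, LDR[27:24] = member bits */
	struct apic_map_geometry xapic_cluster = { 8, 4, 0xf, 0xf };
	uint32_t ldr = 0x35u << 24;		/* cluster 3, members 0 and 2 */

	assert(apic_cluster_id(&xapic_cluster, ldr) == 3);
	assert(apic_logical_id(&xapic_cluster, ldr) == 0x5);

	/* x2APIC: LDR[31:16] = cluster id, LDR[15:0] = member bits */
	struct apic_map_geometry x2apic = { 32, 16, 0xffff, 0xffff };

	assert(apic_cluster_id(&x2apic, 0x00020008u) == 2);
	assert(apic_logical_id(&x2apic, 0x00020008u) == 0x8);
	return 0;
}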
Signed-off-by: Gleb Natapov Acked-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 12 +++ arch/x86/kvm/lapic.c | 188 +++++++++++++++++++++++++++++++++++++--- arch/x86/kvm/lapic.h | 3 + arch/x86/kvm/x86.c | 2 + 4 files changed, 193 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 43aeb942283..0b902c98f27 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -525,6 +525,16 @@ struct kvm_arch_memory_slot { struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; +struct kvm_apic_map { + struct rcu_head rcu; + u8 ldr_bits; + /* fields bellow are used to decode ldr values in different modes */ + u32 cid_shift, cid_mask, lid_mask; + struct kvm_lapic *phys_map[256]; + /* first index is cluster id second is cpu id in a cluster */ + struct kvm_lapic *logical_map[16][16]; +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -542,6 +552,8 @@ struct kvm_arch { struct kvm_ioapic *vioapic; struct kvm_pit *vpit; int vapics_in_nmi_mode; + struct mutex apic_map_lock; + struct kvm_apic_map *apic_map; unsigned int tss_addr; struct page *apic_access_page; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6f9fd633c88..c6e6b721b6e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -140,11 +140,110 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) +static inline int apic_x2apic_mode(struct kvm_lapic *apic) +{ + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; +} + static inline int kvm_apic_id(struct kvm_lapic *apic) { return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } +static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) +{ + u16 cid; + ldr >>= 32 - map->ldr_bits; + cid = (ldr >> map->cid_shift) & map->cid_mask; + + BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); + + return cid; +} + +static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) +{ + ldr >>= (32 - map->ldr_bits); + return ldr & map->lid_mask; +} + +static void recalculate_apic_map(struct kvm *kvm) +{ + struct kvm_apic_map *new, *old = NULL; + struct kvm_vcpu *vcpu; + int i; + + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL); + + mutex_lock(&kvm->arch.apic_map_lock); + + if (!new) + goto out; + + new->ldr_bits = 8; + /* flat mode is default */ + new->cid_shift = 8; + new->cid_mask = 0; + new->lid_mask = 0xff; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvm_lapic *apic = vcpu->arch.apic; + u16 cid, lid; + u32 ldr; + + if (!kvm_apic_present(vcpu)) + continue; + + /* + * All APICs have to be configured in the same mode by an OS. + * We take advatage of this while building logical id loockup + * table. After reset APICs are in xapic/flat mode, so if we + * find apic with different setting we assume this is the mode + * OS wants all apics to be in; build lookup table accordingly. 
+ */ + if (apic_x2apic_mode(apic)) { + new->ldr_bits = 32; + new->cid_shift = 16; + new->cid_mask = new->lid_mask = 0xffff; + } else if (kvm_apic_sw_enabled(apic) && + !new->cid_mask /* flat mode */ && + kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) { + new->cid_shift = 4; + new->cid_mask = 0xf; + new->lid_mask = 0xf; + } + + new->phys_map[kvm_apic_id(apic)] = apic; + + ldr = kvm_apic_get_reg(apic, APIC_LDR); + cid = apic_cluster_id(new, ldr); + lid = apic_logical_id(new, ldr); + + if (lid) + new->logical_map[cid][ffs(lid) - 1] = apic; + } +out: + old = rcu_dereference_protected(kvm->arch.apic_map, + lockdep_is_held(&kvm->arch.apic_map_lock)); + rcu_assign_pointer(kvm->arch.apic_map, new); + mutex_unlock(&kvm->arch.apic_map_lock); + + if (old) + kfree_rcu(old, rcu); +} + +static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) +{ + apic_set_reg(apic, APIC_ID, id << 24); + recalculate_apic_map(apic->vcpu->kvm); +} + +static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) +{ + apic_set_reg(apic, APIC_LDR, id); + recalculate_apic_map(apic->vcpu->kvm); +} + static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); @@ -194,11 +293,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_LVR, v); } -static inline int apic_x2apic_mode(struct kvm_lapic *apic) -{ - return apic->vcpu->arch.apic_base & X2APIC_ENABLE; -} - static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ @@ -483,6 +577,72 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, return result; } +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, int *r) +{ + struct kvm_apic_map *map; + unsigned long bitmap = 1; + struct kvm_lapic **dst; + int i; + bool ret = false; + + *r = -1; + + if (irq->shorthand == APIC_DEST_SELF) { + *r = kvm_apic_set_irq(src->vcpu, irq); + return true; + } + + if (irq->shorthand) + return false; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (!map) + goto out; + + if (irq->dest_mode == 0) { /* physical mode */ + if (irq->delivery_mode == APIC_DM_LOWEST || + irq->dest_id == 0xff) + goto out; + dst = &map->phys_map[irq->dest_id & 0xff]; + } else { + u32 mda = irq->dest_id << (32 - map->ldr_bits); + + dst = map->logical_map[apic_cluster_id(map, mda)]; + + bitmap = apic_logical_id(map, mda); + + if (irq->delivery_mode == APIC_DM_LOWEST) { + int l = -1; + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (l < 0) + l = i; + else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0) + l = i; + } + + bitmap = (l >= 0) ? 1 << l : 0; + } + } + + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (*r < 0) + *r = 0; + *r += kvm_apic_set_irq(dst[i]->vcpu, irq); + } + + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. 
@@ -886,7 +1046,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) - apic_set_reg(apic, APIC_ID, val); + kvm_apic_set_id(apic, val >> 24); else ret = 1; break; @@ -902,15 +1062,16 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LDR: if (!apic_x2apic_mode(apic)) - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + kvm_apic_set_ldr(apic, val & APIC_LDR_MASK); else ret = 1; break; case APIC_DFR: - if (!apic_x2apic_mode(apic)) + if (!apic_x2apic_mode(apic)) { apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - else + recalculate_apic_map(apic->vcpu->kvm); + } else ret = 1; break; @@ -1141,6 +1302,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) static_key_slow_dec_deferred(&apic_hw_disabled); else static_key_slow_inc(&apic_hw_disabled.key); + recalculate_apic_map(vcpu->kvm); } if (!kvm_vcpu_is_bsp(apic->vcpu)) @@ -1150,7 +1312,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (apic_x2apic_mode(apic)) { u32 id = kvm_apic_id(apic); u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); - apic_set_reg(apic, APIC_LDR, ldr); + kvm_apic_set_ldr(apic, ldr); } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -1175,7 +1337,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); + kvm_apic_set_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); for (i = 0; i < APIC_LVT_NUM; i++) @@ -1186,7 +1348,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_DFR, 0xffffffffU); apic_set_spiv(apic, 0xff); apic_set_reg(apic, APIC_TASKPRI, 0); - apic_set_reg(apic, APIC_LDR, 0); + kvm_apic_set_ldr(apic, 0); apic_set_reg(apic, APIC_ESR, 0); apic_set_reg(apic, APIC_ICR, 0); apic_set_reg(apic, APIC_ICR2, 0); @@ -1404,6 +1566,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); + /* call kvm_apic_set_id() to put apic into apic_map */ + kvm_apic_set_id(apic, kvm_apic_id(apic)); kvm_apic_set_version(vcpu); apic_update_ppr(apic); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 615a8b03016..e5ebf9f3571 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -52,6 +52,9 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, int *r); + u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 497226e49d4..fc2a0a132e4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6270,6 +6270,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); raw_spin_lock_init(&kvm->arch.tsc_write_lock); + mutex_init(&kvm->arch.apic_map_lock); return 0; } @@ -6322,6 +6323,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) 
put_page(kvm->arch.ept_identity_pagetable); + kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); } void kvm_arch_free_memslot(struct kvm_memory_slot *free, -- cgit v1.2.3 From 7a84428af7ca6a847f058c9ff244a18a2664fd1b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Sep 2012 11:58:03 -0600 Subject: KVM: Add resampling irqfds for level triggered interrupts To emulate level triggered interrupts, add a resample option to KVM_IRQFD. When specified, a new resamplefd is provided that notifies the user when the irqchip has been resampled by the VM. This may, for instance, indicate an EOI. Also in this mode, posting of an interrupt through an irqfd only asserts the interrupt. On resampling, the interrupt is automatically de-asserted prior to user notification. This enables level triggered interrupts to be posted and re-enabled from vfio with no userspace intervention. All resampling irqfds can make use of a single irq source ID, so we reserve a new one for this interface. Signed-off-by: Alex Williamson Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fc2a0a132e4..7d44204c604 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2176,6 +2176,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -6268,6 +6269,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ + set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, + &kvm->arch.irq_sources_bitmap); raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); -- cgit v1.2.3 From c863901075a42d50678616d8ee4b96ef13080498 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 21 Sep 2012 05:42:55 +0200 Subject: KVM: x86: Fix guest debug across vcpu INIT reset If we reset a vcpu on INIT, we so far overwrote dr7 as provided by KVM_SET_GUEST_DEBUG, and we also cleared switch_db_regs unconditionally. Fix this by saving the dr7 used for guest debugging and calculating the effective register value as well as switch_db_regs on any potential change. This will change to focus of the set_guest_debug vendor op to update_dp_bp_intercept. Found while trying to stop on start_secondary. 
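A rough stand-alone sketch of the effective-DR7 selection this centralizes in kvm_update_dr7(); the flag and mask values here are placeholders picked for the example rather than the kernel's definitions:

#include <assert.h>
#include <stdbool.h>

#define GUESTDBG_USE_HW_BP 0x1u		/* placeholder flag, not the real KVM_GUESTDBG_* value */
#define DR7_BP_EN_MASK     0xffu	/* assumed: low byte holds the breakpoint enable bits */

struct vcpu_dbg_state {
	unsigned guest_debug;		/* debugger flags */
	unsigned long dr7;		/* guest-visible dr7 */
	unsigned long guest_debug_dr7;	/* dr7 requested via SET_GUEST_DEBUG */
	unsigned long hw_dr7;		/* value that would reach the CPU */
	bool switch_db_regs;
};

/* The debugger's dr7 wins while hardware breakpoints are in use; both the
 * hardware value and switch_db_regs are derived from whichever dr7 won. */
static void update_dr7(struct vcpu_dbg_state *v)
{
	unsigned long dr7 = (v->guest_debug & GUESTDBG_USE_HW_BP)
			    ? v->guest_debug_dr7 : v->dr7;

	v->hw_dr7 = dr7;
	v->switch_db_regs = dr7 & DR7_BP_EN_MASK;
}

int main(void)
{
	struct vcpu_dbg_state v = { 0, 0x400, 0, 0, false };

	update_dr7(&v);			/* no debugger: the guest's dr7 is used */
	assert(v.hw_dr7 == 0x400 && !v.switch_db_regs);

	v.guest_debug = GUESTDBG_USE_HW_BP;
	v.guest_debug_dr7 = 0x401;	/* debugger enables breakpoint 0 */
	update_dr7(&v);
	assert(v.hw_dr7 == 0x401 && v.switch_db_regs);
	return 0;
}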
Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/svm.c | 23 ++++------------------- arch/x86/kvm/vmx.c | 14 +------------- arch/x86/kvm/x86.c | 26 +++++++++++++++++--------- 4 files changed, 24 insertions(+), 43 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0b902c98f27..c9a91368fc5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -471,6 +471,7 @@ struct kvm_vcpu_arch { unsigned long dr6; unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; + unsigned long guest_debug_dr7; u64 mcg_cap; u64 mcg_status; @@ -647,8 +648,7 @@ struct kvm_x86_ops { void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*set_guest_debug)(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg); + void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 818fceb3091..d017df3899e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1146,7 +1146,6 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; - save->dr7 = 0x400; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; @@ -1643,7 +1642,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_db_intercept(struct kvm_vcpu *vcpu) +static void update_db_bp_intercept(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1663,20 +1662,6 @@ static void update_db_intercept(struct kvm_vcpu *vcpu) vcpu->guest_debug = 0; } -static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; - else - svm->vmcb->save.dr7 = vcpu->arch.dr7; - - mark_dirty(svm->vmcb, VMCB_DR); - - update_db_intercept(vcpu); -} - static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) { if (sd->next_asid > sd->max_asid) { @@ -1748,7 +1733,7 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_intercept(&svm->vcpu); + update_db_bp_intercept(&svm->vcpu); } if (svm->vcpu.guest_debug & @@ -3659,7 +3644,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) */ svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_intercept(vcpu); + update_db_bp_intercept(vcpu); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -4253,7 +4238,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .set_guest_debug = svm_guest_debug, + .update_db_bp_intercept = update_db_bp_intercept, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 30bcb953afe..5d46c905e06 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2288,16 +2288,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) -{ - if 
(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); - else - vmcs_writel(GUEST_DR7, vcpu->arch.dr7); - - update_exception_bitmap(vcpu); -} - static __init int cpu_has_kvm_support(void) { return cpu_has_vmx(); @@ -3960,8 +3950,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, 0); kvm_register_write(vcpu, VCPU_REGS_RSP, 0); - vmcs_writel(GUEST_DR7, 0x400); - vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -7237,7 +7225,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .set_guest_debug = set_guest_debug, + .update_db_bp_intercept = update_exception_bitmap, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7d44204c604..b16d4a5bfa4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -692,6 +692,18 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +static void kvm_update_dr7(struct kvm_vcpu *vcpu) +{ + unsigned long dr7; + + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + dr7 = vcpu->arch.guest_debug_dr7; + else + dr7 = vcpu->arch.dr7; + kvm_x86_ops->set_dr7(vcpu, dr7); + vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); +} + static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { @@ -717,10 +729,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) if (val & 0xffffffff00000000ULL) return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); - vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK); - } + kvm_update_dr7(vcpu); break; } @@ -5851,13 +5860,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { for (i = 0; i < KVM_NR_DB_REGS; ++i) vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; - vcpu->arch.switch_db_regs = - (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); + vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7]; } else { for (i = 0; i < KVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); } + kvm_update_dr7(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + @@ -5869,7 +5877,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops->set_guest_debug(vcpu, dbg); + kvm_x86_ops->update_db_bp_intercept(vcpu); r = 0; @@ -6045,10 +6053,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; - vcpu->arch.switch_db_regs = 0; memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = DR6_FIXED_1; vcpu->arch.dr7 = DR7_FIXED_1; + kvm_update_dr7(vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; -- cgit v1.2.3 From 3d11df7abbff013b811d5615320580cd5d9d7d31 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 27 Sep 2012 17:29:45 +0200 Subject: KVM: s390: Fix vcpu_load handling in interrupt code Recent changes (KVM: make processes waiting on vcpu mutex killable) now requires to check the return value of vcpu_load. This triggered a warning in s390 specific kvm code. Turns out that we can actually remove the put/load, since schedule will do the right thing via the preempt notifiers. 
Reported-by: Fengguang Wu Signed-off-by: Christian Borntraeger Signed-off-by: Avi Kivity --- arch/s390/kvm/interrupt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 7556231fb07..ff1e2f8ef94 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -423,9 +423,7 @@ no_timer: set_current_state(TASK_INTERRUPTIBLE); spin_unlock_bh(&vcpu->arch.local_int.lock); spin_unlock(&vcpu->arch.local_int.float_int->lock); - vcpu_put(vcpu); schedule(); - vcpu_load(vcpu); spin_lock(&vcpu->arch.local_int.float_int->lock); spin_lock_bh(&vcpu->arch.local_int.lock); } -- cgit v1.2.3