From 489f1d6526ab68ca1842398fa3ae95c597fe3d32 Mon Sep 17 00:00:00 2001 From: "Dong, Eddie" Date: Mon, 7 Jan 2008 11:14:20 +0200 Subject: KVM: MMU: Update shadow ptes on partial guest pte writes A guest partial guest pte write will leave shadow_trap_nonpresent_pte in spte, which generates a vmexit at the next guest access through that pte. This patch improves this by reading the full guest pte in advance and thus being able to update the spte and eliminate the vmexit. This helps pae guests which use two 32-bit writes to set a single 64-bit pte. [truncation fix by Eric] Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Feng (Eric) Liu Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 23 ++++++++++++++++------- arch/x86/kvm/paging_tmpl.h | 7 ++----- 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e55af12e11b..28f9a44060c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1329,8 +1329,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, - const void *new, int bytes, - int offset_in_pte) + const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { ++vcpu->kvm->stat.mmu_pde_zapped; @@ -1339,9 +1338,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, ++vcpu->kvm->stat.mmu_pte_updated; if (sp->role.glevels == PT32_ROOT_LEVEL) - paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); + paging32_update_pte(vcpu, sp, spte, new); else - paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); + paging64_update_pte(vcpu, sp, spte, new); } static bool need_remote_flush(u64 old, u64 new) @@ -1423,7 +1422,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, struct hlist_node *node, *n; struct hlist_head *bucket; unsigned index; - u64 entry; + u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); unsigned pte_size; @@ -1433,6 +1432,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int level; int flooded = 0; int npte; + int r; pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); @@ -1496,11 +1496,20 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, continue; } spte = &sp->spt[page_offset / sizeof(*spte)]; + if ((gpa & (pte_size - 1)) || (bytes < pte_size)) { + gentry = 0; + r = kvm_read_guest_atomic(vcpu->kvm, + gpa & ~(u64)(pte_size - 1), + &gentry, pte_size); + new = (const void *)&gentry; + if (r < 0) + new = NULL; + } while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); - mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, - page_offset & (pte_size - 1)); + if (new) + mmu_pte_write_new_pte(vcpu, sp, spte, new); mmu_pte_write_flush_tlb(vcpu, entry, *spte); ++spte; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ecc0856268c..c2fd2b96144 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -243,8 +243,7 @@ err: } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, - u64 *spte, const void *pte, int bytes, - int offset_in_pte) + u64 *spte, const void *pte) { pt_element_t gpte; unsigned pte_access; @@ -252,12 +251,10 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { - if (!offset_in_pte && !is_present_pte(gpte)) + if (!is_present_pte(gpte)) set_shadow_pte(spte, shadow_notrap_nonpresent_pte); return; } - if (bytes < sizeof(pt_element_t)) - return; pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) -- cgit v1.2.3 From 1ae0a13def678876b9acfb5ac1e2cf7d5d45a60d Mon Sep 17 00:00:00 2001 From: "Dong, Eddie" Date: Mon, 7 Jan 2008 13:20:25 +0200 Subject: KVM: MMU: Simplify hash table indexing Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 28f9a44060c..6f8392d4034 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -559,7 +559,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn; + return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); } static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, @@ -663,7 +663,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) struct hlist_node *node; pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry(sp, node, bucket, hash_link) if (sp->gfn == gfn && !sp->role.metaphysical) { @@ -701,7 +701,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, } pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, gfn, role.word); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry(sp, node, bucket, hash_link) if (sp->gfn == gfn && sp->role.word == role.word) { @@ -840,7 +840,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); r = 0; - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) if (sp->gfn == gfn && !sp->role.metaphysical) { @@ -1450,7 +1450,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.last_pt_write_count = 1; vcpu->arch.last_pte_updated = NULL; } - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { if (sp->gfn != gfn || sp->role.metaphysical) -- cgit v1.2.3 From e09d082c03e137015bc0a17ca77e4b9dca08a5d7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 18 Jan 2008 12:38:59 +0200 Subject: KVM: x86 emulator: add support for group decoding Certain x86 instructions use bits 3:5 of the byte following the opcode as an opcode extension, with the decode sometimes depending on bits 6:7 as well. Add support for this in the main decoding table rather than an ad-hock adaptation per opcode. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 79586003397..46ecf349a11 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -65,6 +65,9 @@ #define MemAbs (1<<9) /* Memory operand is absolute displacement */ #define String (1<<10) /* String instruction (rep capable) */ #define Stack (1<<11) /* Stack instruction (push/pop) */ +#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ +#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ +#define GroupMask 0xff /* Group number stored in bits 0:7 */ static u16 opcode_table[256] = { /* 0x00 - 0x07 */ @@ -229,6 +232,12 @@ static u16 twobyte_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +static u16 group_table[] = { +}; + +static u16 group2_table[] = { +}; + /* EFLAGS bit definitions. */ #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) @@ -763,7 +772,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct decode_cache *c = &ctxt->decode; int rc = 0; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes; + int def_op_bytes, def_ad_bytes, group; /* Shadow copy of register state. Committed on successful emulation. */ @@ -864,12 +873,24 @@ done_prefixes: c->b = insn_fetch(u8, 1, c->eip); c->d = twobyte_table[c->b]; } + } - /* Unrecognised? */ - if (c->d == 0) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } + if (c->d & Group) { + group = c->d & GroupMask; + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + + group = (group << 3) + ((c->modrm >> 3) & 7); + if ((c->d & GroupDual) && (c->modrm >> 6) == 3) + c->d = group2_table[group]; + else + c->d = group_table[group]; + } + + /* Unrecognised? */ + if (c->d == 0) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; } if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) -- cgit v1.2.3 From 43bb19cd3398d3f544d8e2d6ed6c5c5d7b4e5819 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 18 Jan 2008 12:46:50 +0200 Subject: KVM: x86 emulator: group decoding for group 1A This adds group decode support for opcode 0x8f. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 46ecf349a11..cf1ce7c316a 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -69,6 +69,10 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ +enum { + Group1A, +}; + static u16 opcode_table[256] = { /* 0x00 - 0x07 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, @@ -133,7 +137,7 @@ static u16 opcode_table[256] = { /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, + 0, ModRM | DstReg, 0, Group | Group1A, /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, @@ -233,6 +237,8 @@ static u16 twobyte_table[256] = { }; static u16 group_table[] = { + [Group1A*8] = + DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, }; static u16 group2_table[] = { -- cgit v1.2.3 From 7d858a19efe5844a98e060931570359b70dea6d1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 18 Jan 2008 12:58:04 +0200 Subject: KVM: x86 emulator: Group decoding for group 3 This adds group decoding support for opcodes 0xf6, 0xf7 (group 3). Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index cf1ce7c316a..52e65ae37f0 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -70,7 +70,7 @@ #define GroupMask 0xff /* Group number stored in bits 0:7 */ enum { - Group1A, + Group1A, Group3_Byte, Group3, }; static u16 opcode_table[256] = { @@ -171,8 +171,7 @@ static u16 opcode_table[256] = { 0, 0, 0, 0, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, - ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM @@ -239,6 +238,14 @@ static u16 twobyte_table[256] = { static u16 group_table[] = { [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, + [Group3_Byte*8] = + ByteOp | SrcImm | DstMem | ModRM, 0, + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, + [Group3*8] = + DstMem | SrcImm | ModRM | SrcImm, 0, + DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, }; static u16 group2_table[] = { @@ -1070,26 +1077,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, switch (c->modrm_reg) { case 0 ... 1: /* test */ - /* - * Special case in Grp3: test has an immediate - * source operand. - */ - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); break; case 2: /* not */ @@ -1103,7 +1090,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, rc = X86EMUL_UNHANDLEABLE; break; } -done: return rc; } -- cgit v1.2.3 From fd60754e4ffa992586346dd56451723b4c096626 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 18 Jan 2008 13:12:26 +0200 Subject: KVM: x86 emulator: Group decoding for groups 4 and 5 Add group decoding support for opcode 0xfe (group 4) and 0xff (group 5). Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 52e65ae37f0..7310368c485 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -70,7 +70,7 @@ #define GroupMask 0xff /* Group number stored in bits 0:7 */ enum { - Group1A, Group3_Byte, Group3, + Group1A, Group3_Byte, Group3, Group4, Group5, }; static u16 opcode_table[256] = { @@ -174,7 +174,7 @@ static u16 opcode_table[256] = { ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, - 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM + 0, 0, Group | Group4, Group | Group5, }; static u16 twobyte_table[256] = { @@ -246,6 +246,12 @@ static u16 group_table[] = { DstMem | SrcImm | ModRM | SrcImm, 0, DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, + [Group4*8] = + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, 0, 0, + [Group5*8] = + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, + SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, }; static u16 group2_table[] = { @@ -1097,7 +1103,6 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc; switch (c->modrm_reg) { case 0: /* inc */ @@ -1107,36 +1112,11 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, emulate_1op("dec", c->dst, ctxt->eflags); break; case 4: /* jmp abs */ - if (c->b == 0xff) - c->eip = c->dst.val; - else { - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; - } + c->eip = c->src.val; break; case 6: /* push */ - - /* 64-bit mode: PUSH always pushes a 64-bit operand. */ - - if (ctxt->mode == X86EMUL_MODE_PROT64) { - c->dst.bytes = 8; - rc = ops->read_std((unsigned long)c->dst.ptr, - &c->dst.val, 8, ctxt->vcpu); - if (rc != 0) - return rc; - } - register_address_increment(c->regs[VCPU_REGS_RSP], - -c->dst.bytes); - rc = ops->write_emulated(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), &c->dst.val, - c->dst.bytes, ctxt->vcpu); - if (rc != 0) - return rc; - c->dst.type = OP_NONE; + emulate_push(ctxt); break; - default: - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; } return 0; } -- cgit v1.2.3 From d95058a1a7170ae2af2939cbdab0ff5d5e005238 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 18 Jan 2008 13:36:50 +0200 Subject: KVM: x86 emulator: add group 7 decoding This adds group decoding for opcode 0x0f 0x01 (group 7). Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 7310368c485..ef6de16bb59 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -70,7 +70,7 @@ #define GroupMask 0xff /* Group number stored in bits 0:7 */ enum { - Group1A, Group3_Byte, Group3, Group4, Group5, + Group1A, Group3_Byte, Group3, Group4, Group5, Group7, }; static u16 opcode_table[256] = { @@ -179,7 +179,7 @@ static u16 opcode_table[256] = { static u16 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, + 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, @@ -252,9 +252,14 @@ static u16 group_table[] = { [Group5*8] = DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, + [Group7*8] = + 0, 0, ModRM | SrcMem, ModRM | SrcMem, + SrcNone | ModRM | DstMem, 0, SrcMem | ModRM, SrcMem | ModRM | ByteOp, }; static u16 group2_table[] = { + [Group7*8] = + SrcNone | ModRM, 0, 0, 0, SrcNone | ModRM | DstMem, 0, SrcMem | ModRM, 0, }; /* EFLAGS bit definitions. */ -- cgit v1.2.3 From 1d6ad2073e5354912291277c606a57fd37330f04 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 23 Jan 2008 22:26:09 +0200 Subject: KVM: x86 emulator: group decoding for group 1 instructions Opcodes 0x80-0x83 Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ef6de16bb59..22900f70172 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -70,6 +70,7 @@ #define GroupMask 0xff /* Group number stored in bits 0:7 */ enum { + Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, }; @@ -130,8 +131,8 @@ static u16 opcode_table[256] = { ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x80 - 0x87 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + Group | Group1_80, Group | Group1_81, + Group | Group1_82, Group | Group1_83, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, /* 0x88 - 0x8F */ @@ -236,6 +237,26 @@ static u16 twobyte_table[256] = { }; static u16 group_table[] = { + [Group1_80*8] = + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + [Group1_81*8] = + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + [Group1_82*8] = + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + [Group1_83*8] = + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = -- cgit v1.2.3 From d196e343361c229496adeda42335856da9d057de Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 Jan 2008 11:44:11 +0200 Subject: KVM: MMU: Decouple mmio from shadow page tables Currently an mmio guest pte is encoded in the shadow pagetable as a not-present trapping pte, with the SHADOW_IO_MARK bit set. However nothing is ever done with this information, so maintaining it is a useless complication. This patch moves the check for mmio to before shadow ptes are instantiated, so the shadow code is never invoked for ptes that reference mmio. The code is simpler, and with future work, can be made to handle mmio concurrently. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 34 +++++++++++++++------------------- arch/x86/kvm/paging_tmpl.h | 17 ++++++++--------- 2 files changed, 23 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6f8392d4034..6651dfadae5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -101,8 +101,6 @@ static int dbg = 1; #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) - #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define PT64_LEVEL_BITS 9 @@ -200,7 +198,6 @@ static int is_present_pte(unsigned long pte) static int is_shadow_present_pte(u64 pte) { - pte &= ~PT_SHADOW_IO_MARK; return pte != shadow_trap_nonpresent_pte && pte != shadow_notrap_nonpresent_pte; } @@ -215,11 +212,6 @@ static int is_dirty_pte(unsigned long pte) return pte & PT_DIRTY_MASK; } -static int is_io_pte(unsigned long pte) -{ - return pte & PT_SHADOW_IO_MARK; -} - static int is_rmap_pte(u64 pte) { return is_shadow_present_pte(pte); @@ -538,7 +530,7 @@ static int is_empty_shadow_page(u64 *spt) u64 *end; for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { + if (*pos != shadow_trap_nonpresent_pte) { printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, pos, *pos); return 0; @@ -926,13 +918,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if (pte_access & ACC_USER_MASK) spte |= PT_USER_MASK; - if (is_error_page(page)) { - set_shadow_pte(shadow_pte, - shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); - kvm_release_page_clean(page); - return; - } - spte |= page_to_phys(page); if ((pte_access & ACC_WRITE_MASK) @@ -1002,7 +987,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, if (level == 1) { mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, gfn, page); - return pt_write || is_io_pte(table[index]); + return pt_write; } if (table[index] == shadow_trap_nonpresent_pte) { @@ -1039,6 +1024,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) page = gfn_to_page(vcpu->kvm, gfn); up_read(¤t->mm->mmap_sem); + /* mmio */ + if (is_error_page(page)) { + kvm_release_page_clean(page); + up_read(&vcpu->kvm->slots_lock); + return 1; + } + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); r = __nonpaging_map(vcpu, v, write, gfn, page); @@ -1406,10 +1398,14 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); page = gfn_to_page(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); + up_read(&vcpu->kvm->slots_lock); + if (is_error_page(page)) { + kvm_release_page_clean(page); + return; + } vcpu->arch.update_pte.gfn = gfn; vcpu->arch.update_pte.page = page; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index c2fd2b96144..4b55f462e2b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -399,6 +399,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, page = gfn_to_page(vcpu->kvm, walker.gfn); up_read(¤t->mm->mmap_sem); + /* mmio */ + if (is_error_page(page)) { + pgprintk("gfn %x is mmio\n", walker.gfn); + kvm_release_page_clean(page); + up_read(&vcpu->kvm->slots_lock); + return 1; + } + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, @@ -409,15 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (!write_pt) vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - /* - * mmio: emulate if accessible, otherwise its a guest fault. - */ - if (shadow_pte && is_io_pte(*shadow_pte)) { - spin_unlock(&vcpu->kvm->mmu_lock); - up_read(&vcpu->kvm->slots_lock); - return 1; - } - ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); spin_unlock(&vcpu->kvm->mmu_lock); -- cgit v1.2.3 From 2384d2b32640839a4d4d260ca7c5aa4edbf68d91 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 17 Jan 2008 15:14:33 +0800 Subject: KVM: VMX: Enable Virtual Processor Identification (VPID) To allow TLB entries to be retained across VM entry and VM exit, the VMM can now identify distinct address spaces through a new virtual-processor ID (VPID) field of the VMCS. [avi: drop vpid_sync_all()] [avi: add "cc" to asm constraints] Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++---- arch/x86/kvm/vmx.h | 6 ++++ 2 files changed, 81 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8e1462880d1..1157e8a4059 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -37,6 +37,9 @@ MODULE_LICENSE("GPL"); static int bypass_guest_pf = 1; module_param(bypass_guest_pf, bool, 0); +static int enable_vpid = 1; +module_param(enable_vpid, bool, 0); + struct vmcs { u32 revision_id; u32 abort; @@ -71,6 +74,7 @@ struct vcpu_vmx { unsigned rip; } irq; } rmode; + int vpid; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -86,6 +90,9 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static struct page *vmx_io_bitmap_a; static struct page *vmx_io_bitmap_b; +static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); +static DEFINE_SPINLOCK(vmx_vpid_lock); + static struct vmcs_config { int size; int order; @@ -204,6 +211,12 @@ static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) (irqchip_in_kernel(kvm))); } +static inline int cpu_has_vmx_vpid(void) +{ + return (vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID); +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -214,6 +227,20 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) return -1; } +static inline void __invvpid(int ext, u16 vpid, gva_t gva) +{ + struct { + u64 vpid : 16; + u64 rsvd : 48; + u64 gva; + } operand = { vpid, 0, gva }; + + asm volatile (ASM_VMX_INVVPID + /* CF==1 or ZF==1 --> rc = -1 */ + "; ja 1f ; ud2 ; 1:" + : : "a"(&operand), "c"(ext) : "cc", "memory"); +} + static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -257,6 +284,14 @@ static void vcpu_clear(struct vcpu_vmx *vmx) vmx->launched = 0; } +static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) +{ + if (vmx->vpid == 0) + return; + + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); +} + static unsigned long vmcs_readl(unsigned long field) { unsigned long value; @@ -490,6 +525,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->cpu != cpu) { vcpu_clear(vmx); kvm_migrate_apic_timer(vcpu); + vpid_sync_vcpu_all(vmx); } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { @@ -971,7 +1007,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { min = 0; opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_WBINVD_EXITING; + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_ENABLE_VPID; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) return -EIO; @@ -1239,6 +1276,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + vpid_sync_vcpu_all(to_vmx(vcpu)); +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; @@ -1275,6 +1317,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, cr3); if (vcpu->arch.cr0 & X86_CR0_PE) vmx_fpu_deactivate(vcpu); @@ -1494,6 +1537,22 @@ out: return r; } +static void allocate_vpid(struct vcpu_vmx *vmx) +{ + int vpid; + + vmx->vpid = 0; + if (!enable_vpid || !cpu_has_vmx_vpid()) + return; + spin_lock(&vmx_vpid_lock); + vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); + if (vpid < VMX_NR_VPIDS) { + vmx->vpid = vpid; + __set_bit(vpid, vmx_vpid_bitmap); + } + spin_unlock(&vmx_vpid_lock); +} + /* * Sets up the vmcs for emulated real mode. */ @@ -1532,6 +1591,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (vmx->vpid == 0) + exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -1704,6 +1765,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + if (vmx->vpid != 0) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx->vcpu.arch.cr0 = 0x60000010; vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); @@ -1713,6 +1777,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx_fpu_activate(&vmx->vcpu); update_exception_bitmap(&vmx->vcpu); + vpid_sync_vcpu_all(vmx); + return 0; out: @@ -2221,10 +2287,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) -{ -} - static void update_tpr_threshold(struct kvm_vcpu *vcpu) { int max_irr, tpr; @@ -2489,6 +2551,10 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + spin_lock(&vmx_vpid_lock); + if (vmx->vpid != 0) + __clear_bit(vmx->vpid, vmx_vpid_bitmap); + spin_unlock(&vmx_vpid_lock); vmx_free_vmcs(vcpu); kfree(vmx->host_msrs); kfree(vmx->guest_msrs); @@ -2505,6 +2571,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); + allocate_vpid(vmx); + err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) goto free_vcpu; @@ -2652,6 +2720,8 @@ static int __init vmx_init(void) memset(iova, 0xff, PAGE_SIZE); kunmap(vmx_io_bitmap_b); + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) goto out1; diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index d52ae8d7303..436ce0f2909 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -49,6 +49,7 @@ * Definitions of Secondary Processor-Based VM-Execution Controls. */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 @@ -65,6 +66,7 @@ /* VMCS Encodings */ enum vmcs_field { + VIRTUAL_PROCESSOR_ID = 0x00000000, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -321,4 +323,8 @@ enum vmcs_field { #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 +#define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 +#define VMX_VPID_EXTENT_ALL_CONTEXT 2 + #endif -- cgit v1.2.3 From f2b4b7ddf633ffa24ce7c89c9e0d8a06463484e3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 31 Jan 2008 14:57:37 +0100 Subject: KVM: make EFER_RESERVED_BITS configurable for architecture code This patch give the SVM and VMX implementations the ability to add some bits the guest can set in its EFER register. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6b01552bd1f..ec9265b354b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -41,7 +41,7 @@ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) -#define EFER_RESERVED_BITS 0xfffffffffffff2fe +static u64 __read_mostly efer_reserved_bits = 0xfffffffffffff2fe; #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU @@ -428,7 +428,7 @@ static u32 emulated_msrs[] = { static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & EFER_RESERVED_BITS) { + if (efer & efer_reserved_bits) { printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", efer); kvm_inject_gp(vcpu, 0); @@ -452,6 +452,13 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) #endif +void kvm_enable_efer_bits(u64 mask) +{ + efer_reserved_bits &= ~mask; +} +EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); + + /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. -- cgit v1.2.3 From 50a37eb4e05efaa7bac6a948fd4db1a48c728b99 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 31 Jan 2008 14:57:38 +0100 Subject: KVM: align valid EFER bits with the features of the host system This patch aligns the bits the guest can set in the EFER register with the features in the host processor. Currently it lets EFER.NX disabled if the processor does not support it and enables EFER.LME and EFER.LMA only for KVM on 64 bit hosts. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +++ arch/x86/kvm/vmx.c | 4 ++++ arch/x86/kvm/x86.c | 10 +++++++++- 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1a582f1090e..ff3bc74af72 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -403,6 +403,9 @@ static __init int svm_hardware_setup(void) set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + for_each_online_cpu(cpu) { r = svm_cpu_init(cpu); if (r) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1157e8a4059..a509910f6b5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1117,6 +1117,10 @@ static __init int hardware_setup(void) { if (setup_vmcs_config(&vmcs_config) < 0) return -EIO; + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + return alloc_kvm_area(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec9265b354b..db16f2353e4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -41,7 +41,15 @@ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) -static u64 __read_mostly efer_reserved_bits = 0xfffffffffffff2fe; +/* EFER defaults: + * - enable syscall per default because its emulated by KVM + * - enable LME and LMA per default on 64 bit KVM + */ +#ifdef CONFIG_X86_64 +static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; +#else +static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; +#endif #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU -- cgit v1.2.3 From 9f62e19a1107466b9e9501e23a9dd5acb81fdca1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 31 Jan 2008 14:57:39 +0100 Subject: KVM: VMX: unifdef the EFER specific code To allow access to the EFER register in 32bit KVM the EFER specific code has to be exported to the x86 generic code. This patch does this in a backwards compatible manner. [avi: add check for EFER-less hosts] Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a509910f6b5..76944f2c883 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1335,14 +1335,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vcpu->arch.cr4 = cr4; } -#ifdef CONFIG_X86_64 - static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); vcpu->arch.shadow_efer = efer; + if (!msr) + return; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1359,8 +1359,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) setup_msrs(vmx); } -#endif - static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -1775,9 +1773,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.cr0 = 0x60000010; vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); -#ifdef CONFIG_X86_64 vmx_set_efer(&vmx->vcpu, 0); -#endif vmx_fpu_activate(&vmx->vcpu); update_exception_bitmap(&vmx->vcpu); @@ -2668,9 +2664,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, .set_cr4 = vmx_set_cr4, -#ifdef CONFIG_X86_64 .set_efer = vmx_set_efer, -#endif .get_idt = vmx_get_idt, .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, -- cgit v1.2.3 From 9457a712a2f464c4b21bb7f78998775c69673a0c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 31 Jan 2008 14:57:40 +0100 Subject: KVM: allow access to EFER in 32bit KVM This patch makes the EFER register accessible on a 32bit KVM host. This is necessary to boot 32 bit PAE guests under SVM. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index db16f2353e4..38edb2f558e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -432,8 +432,6 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, }; -#ifdef CONFIG_X86_64 - static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) { @@ -458,8 +456,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) vcpu->arch.shadow_efer = efer; } -#endif - void kvm_enable_efer_bits(u64 mask) { efer_reserved_bits &= ~mask; @@ -489,11 +485,9 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { -#ifdef CONFIG_X86_64 case MSR_EFER: set_efer(vcpu, data); break; -#endif case MSR_IA32_MC0_STATUS: pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data); @@ -571,11 +565,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; -#ifdef CONFIG_X86_64 case MSR_EFER: data = vcpu->arch.shadow_efer; break; -#endif default: pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; @@ -2880,9 +2872,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, set_cr8(vcpu, sregs->cr8); mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; -#ifdef CONFIG_X86_64 kvm_x86_ops->set_efer(vcpu, sregs->efer); -#endif kvm_set_apic_base(vcpu, sregs->apic_base); kvm_x86_ops->decache_cr4_guest_bits(vcpu); -- cgit v1.2.3 From 33bd6a0b3e8baed6469c8e68ea1b16cb50c4f5af Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:38 +0100 Subject: KVM: SVM: move feature detection to hardware setup code By moving the SVM feature detection from the each_cpu code to the hardware setup code it runs only once. As an additional advance the feature check is now available earlier in the module setup process. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ff3bc74af72..5f527dc0e16 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -302,7 +302,6 @@ static void svm_hardware_enable(void *garbage) svm_data->asid_generation = 1; svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; svm_data->next_asid = svm_data->max_asid + 1; - svm_features = cpuid_edx(SVM_CPUID_FUNC); asm volatile ("sgdt %0" : "=m"(gdt_descr)); gdt = (struct desc_struct *)gdt_descr.address; @@ -411,6 +410,9 @@ static __init int svm_hardware_setup(void) if (r) goto err_2; } + + svm_features = cpuid_edx(SVM_CPUID_FUNC); + return 0; err_2: -- cgit v1.2.3 From e3da3acdb32c1804a5c853feebcc037b7434076f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:39 +0100 Subject: KVM: SVM: add detection of Nested Paging feature Let SVM detect if the Nested Paging feature is available on the hardware. Disable it to keep this patch series bisectable. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5f527dc0e16..c12a75953b5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -47,6 +47,8 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) +static bool npt_enabled = false; + static void kvm_reput_irq(struct vcpu_svm *svm); static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) @@ -413,6 +415,12 @@ static __init int svm_hardware_setup(void) svm_features = cpuid_edx(SVM_CPUID_FUNC); + if (!svm_has(SVM_FEATURE_NPT)) + npt_enabled = false; + + if (npt_enabled) + printk(KERN_INFO "kvm: Nested Paging enabled\n"); + return 0; err_2: -- cgit v1.2.3 From 6c7dac72d5c7dc0e09512dce865398167be9a8f7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:40 +0100 Subject: KVM: SVM: add module parameter to disable Nested Paging To disable the use of the Nested Paging feature even if it is available in hardware this patch adds a module parameter. Nested Paging can be disabled by passing npt=0 to the kvm_amd module. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c12a75953b5..fb5d6c2e6a0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -48,6 +48,9 @@ MODULE_LICENSE("GPL"); #define SVM_DEATURE_SVML (1 << 2) static bool npt_enabled = false; +static int npt = 1; + +module_param(npt, int, S_IRUGO); static void kvm_reput_irq(struct vcpu_svm *svm); @@ -418,6 +421,11 @@ static __init int svm_hardware_setup(void) if (!svm_has(SVM_FEATURE_NPT)) npt_enabled = false; + if (npt_enabled && !npt) { + printk(KERN_INFO "kvm: Nested Paging disabled\n"); + npt_enabled = false; + } + if (npt_enabled) printk(KERN_INFO "kvm: Nested Paging enabled\n"); -- cgit v1.2.3 From 1855267210e1a8c9d41fe3a3c7a0d42eca5fb7cd Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:41 +0100 Subject: KVM: export information about NPT to generic x86 code The generic x86 code has to know if the specific implementation uses Nested Paging. In the generic code Nested Paging is called Two Dimensional Paging (TDP) to avoid confusion with (future) TDP implementations of other vendors. This patch exports the availability of TDP to the generic x86 code. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 15 +++++++++++++++ arch/x86/kvm/svm.c | 4 +++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6651dfadae5..21cfa289d0f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -32,6 +32,15 @@ #include #include +/* + * When setting this variable to true it enables Two-Dimensional-Paging + * where the hardware walks 2 page tables: + * 1. the guest-virtual to guest-physical + * 2. while doing 1. it walks guest-physical to host-physical + * If the hardware supports that we don't need to do shadow paging. + */ +static bool tdp_enabled = false; + #undef MMU_DEBUG #undef AUDIT @@ -1582,6 +1591,12 @@ out: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_enable_tdp(void) +{ + tdp_enabled = true; +} +EXPORT_SYMBOL_GPL(kvm_enable_tdp); + static void free_mmu_pages(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fb5d6c2e6a0..9e29a13136c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -426,8 +426,10 @@ static __init int svm_hardware_setup(void) npt_enabled = false; } - if (npt_enabled) + if (npt_enabled) { printk(KERN_INFO "kvm: Nested Paging enabled\n"); + kvm_enable_tdp(); + } return 0; -- cgit v1.2.3 From 4d9976bbdc09e08b69fc12fee2042c3528187b32 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:42 +0100 Subject: KVM: MMU: make the __nonpaging_map function generic The mapping function for the nonpaging case in the softmmu does basically the same as required for Nested Paging. Make this function generic so it can be used for both. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 21cfa289d0f..33cd7c982dd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -979,10 +979,9 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, - gfn_t gfn, struct page *page) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + gfn_t gfn, struct page *page, int level) { - int level = PT32E_ROOT_LEVEL; hpa_t table_addr = vcpu->arch.mmu.root_hpa; int pt_write = 0; @@ -1042,7 +1041,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - r = __nonpaging_map(vcpu, v, write, gfn, page); + r = __direct_map(vcpu, v, write, gfn, page, PT32E_ROOT_LEVEL); spin_unlock(&vcpu->kvm->mmu_lock); up_read(&vcpu->kvm->slots_lock); -- cgit v1.2.3 From cc4b6871e771e76dc1de06adb8aed261a1c66be8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:43 +0100 Subject: KVM: export the load_pdptrs() function to modules The load_pdptrs() function is required in the SVM module for NPT support. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 38edb2f558e..0c910c774a9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -213,6 +213,7 @@ out: return ret; } +EXPORT_SYMBOL_GPL(load_pdptrs); static bool pdptrs_changed(struct kvm_vcpu *vcpu) { -- cgit v1.2.3 From fb72d1674d860b0c9ef9b66b7f4f01fe5b3d2c00 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:44 +0100 Subject: KVM: MMU: add TDP support to the KVM MMU This patch contains the changes to the KVM MMU necessary for support of the Nested Paging feature in AMD Barcelona and Phenom Processors. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++--- arch/x86/kvm/mmu.h | 6 +++++ 2 files changed, 82 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 33cd7c982dd..f7541fe22cd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1097,6 +1097,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) int i; gfn_t root_gfn; struct kvm_mmu_page *sp; + int metaphysical = 0; root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; @@ -1105,14 +1106,20 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); + if (tdp_enabled) + metaphysical = 1; sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, 0, ACC_ALL, NULL); + PT64_ROOT_LEVEL, metaphysical, + ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.root_hpa = root; return; } #endif + metaphysical = !is_paging(vcpu); + if (tdp_enabled) + metaphysical = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -1126,7 +1133,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, !is_paging(vcpu), + PT32_ROOT_LEVEL, metaphysical, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; @@ -1160,6 +1167,36 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, error_code & PFERR_WRITE_MASK, gfn); } +static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, + u32 error_code) +{ + struct page *page; + int r; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (is_error_page(page)) { + kvm_release_page_clean(page); + up_read(¤t->mm->mmap_sem); + return 1; + } + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, + gpa >> PAGE_SHIFT, page, TDP_ROOT_LEVEL); + spin_unlock(&vcpu->kvm->mmu_lock); + up_read(¤t->mm->mmap_sem); + + return r; +} + static void nonpaging_free(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); @@ -1253,7 +1290,35 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu) return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); } -static int init_kvm_mmu(struct kvm_vcpu *vcpu) +static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = tdp_page_fault; + context->free = nonpaging_free; + context->prefetch_page = nonpaging_prefetch_page; + context->shadow_root_level = TDP_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + + if (!is_paging(vcpu)) { + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->root_level = 0; + } else if (is_long_mode(vcpu)) { + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT64_ROOT_LEVEL; + } else if (is_pae(vcpu)) { + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT32E_ROOT_LEVEL; + } else { + context->gva_to_gpa = paging32_gva_to_gpa; + context->root_level = PT32_ROOT_LEVEL; + } + + return 0; +} + +static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -1268,6 +1333,14 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) return paging32_init_context(vcpu); } +static int init_kvm_mmu(struct kvm_vcpu *vcpu) +{ + if (tdp_enabled) + return init_kvm_tdp_mmu(vcpu); + else + return init_kvm_softmmu(vcpu); +} + static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 1fce19ec7a2..e64e9f56a65 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -3,6 +3,12 @@ #include +#ifdef CONFIG_X86_64 +#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL +#else +#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL +#endif + static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) -- cgit v1.2.3 From 709ddebf81cb40e3c36c6109a7892e8b93a09464 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Feb 2008 13:47:45 +0100 Subject: KVM: SVM: add support for Nested Paging This patch contains the SVM architecture dependent changes for KVM to enable support for the Nested Paging feature of AMD Barcelona and Phenom processors. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9e29a13136c..8e9d4a5dacd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -47,7 +47,12 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) +/* enable NPT for AMD64 and X86 with PAE */ +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static bool npt_enabled = true; +#else static bool npt_enabled = false; +#endif static int npt = 1; module_param(npt, int, S_IRUGO); @@ -187,7 +192,7 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (!(efer & EFER_LMA)) + if (!npt_enabled && !(efer & EFER_LMA)) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; @@ -573,6 +578,22 @@ static void init_vmcb(struct vmcb *vmcb) save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; save->cr4 = X86_CR4_PAE; /* rdx = ?? */ + + if (npt_enabled) { + /* Setup VMCB for Nested Paging */ + control->nested_ctl = 1; + control->intercept_exceptions &= ~(1 << PF_VECTOR); + control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| + INTERCEPT_CR3_MASK); + control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| + INTERCEPT_CR3_MASK); + save->g_pat = 0x0007040600070406ULL; + /* enable caching because the QEMU Bios doesn't enable it */ + save->cr0 = X86_CR0_ET; + save->cr3 = 0; + save->cr4 = 0; + } + } static int svm_vcpu_reset(struct kvm_vcpu *vcpu) @@ -807,6 +828,9 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } } #endif + if (npt_enabled) + goto set; + if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); vcpu->fpu_active = 1; @@ -814,18 +838,26 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vcpu->arch.cr0 = cr0; cr0 |= X86_CR0_PG | X86_CR0_WP; - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); if (!vcpu->fpu_active) { svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); cr0 |= X86_CR0_TS; } +set: + /* + * re-enable caching here because the QEMU bios + * does not do it - this results in some delay at + * reboot + */ + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { vcpu->arch.cr4 = cr4; - to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; + if (!npt_enabled) + cr4 |= X86_CR4_PAE; + to_svm(vcpu)->vmcb->save.cr4 = cr4; } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1313,14 +1345,34 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_WBINVD] = emulate_on_interception, [SVM_EXIT_MONITOR] = invalid_op_interception, [SVM_EXIT_MWAIT] = invalid_op_interception, + [SVM_EXIT_NPF] = pf_interception, }; - static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u32 exit_code = svm->vmcb->control.exit_code; + if (npt_enabled) { + int mmu_reload = 0; + if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { + svm_set_cr0(vcpu, svm->vmcb->save.cr0); + mmu_reload = 1; + } + vcpu->arch.cr0 = svm->vmcb->save.cr0; + vcpu->arch.cr3 = svm->vmcb->save.cr3; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + } + if (mmu_reload) { + kvm_mmu_reset_context(vcpu); + kvm_mmu_load(vcpu); + } + } + kvm_reput_irq(svm); if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { @@ -1331,7 +1383,8 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } if (is_external_interrupt(svm->vmcb->control.exit_int_info) && - exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) + exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && + exit_code != SVM_EXIT_NPF) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", __FUNCTION__, svm->vmcb->control.exit_int_info, @@ -1522,6 +1575,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->host_dr6 = read_dr6(); svm->host_dr7 = read_dr7(); svm->vmcb->save.cr2 = vcpu->arch.cr2; + /* required for live migration with NPT */ + if (npt_enabled) + svm->vmcb->save.cr3 = vcpu->arch.cr3; if (svm->vmcb->save.dr7 & 0xff) { write_dr7(0); @@ -1665,6 +1721,12 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); + if (npt_enabled) { + svm->vmcb->control.nested_cr3 = root; + force_new_asid(vcpu); + return; + } + svm->vmcb->save.cr3 = root; force_new_asid(vcpu); -- cgit v1.2.3 From 2e11384c2c6f1ce662b1e5b05ba49b216a052f2a Mon Sep 17 00:00:00 2001 From: Ryan Harper Date: Mon, 11 Feb 2008 10:26:38 -0600 Subject: KVM: VMX: fix typo in VMX header define Looking at Intel Volume 3b, page 148, table 20-11 and noticed that the field name is 'Deliver' not 'Deliever'. Attached patch changes the define name and its user in vmx.c Signed-off-by: Ryan Harper Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 6 +++--- arch/x86/kvm/vmx.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 76944f2c883..2d5ccec3f9e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -632,7 +632,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, nr | INTR_TYPE_EXCEPTION - | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) + | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | INTR_INFO_VALID_MASK); if (has_error_code) vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); @@ -1935,7 +1935,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) error_code = 0; rip = vmcs_readl(GUEST_RIP); - if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) + if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { cr2 = vmcs_readl(EXIT_QUALIFICATION); @@ -2351,7 +2351,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK)) + if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, vmcs_read32(IDT_VECTORING_ERROR_CODE)); if (unlikely(has_ext_irq)) diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 436ce0f2909..5dff4606b98 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -233,12 +233,12 @@ enum vmcs_field { */ #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ -#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ #define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK #define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK -#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK +#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ -- cgit v1.2.3 From e6101a96c9efb74c98bba6322d4c5ea89e47e0fe Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 13 Feb 2008 18:58:45 +0100 Subject: KVM: SVM: let init_vmcb() take struct vcpu_svm as parameter Change the parameter of the init_vmcb() function in the kvm-amd module from struct vmcb to struct vcpu_svm. Signed-off-by: Joerg Roedel Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8e9d4a5dacd..d934819733c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -471,10 +471,10 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static void init_vmcb(struct vmcb *vmcb) +static void init_vmcb(struct vcpu_svm *svm) { - struct vmcb_control_area *control = &vmcb->control; - struct vmcb_save_area *save = &vmcb->save; + struct vmcb_control_area *control = &svm->vmcb->control; + struct vmcb_save_area *save = &svm->vmcb->save; control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | @@ -600,7 +600,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - init_vmcb(svm->vmcb); + init_vmcb(svm); if (vcpu->vcpu_id != 0) { svm->vmcb->save.rip = 0; @@ -638,7 +638,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; memset(svm->db_regs, 0, sizeof(svm->db_regs)); - init_vmcb(svm->vmcb); + init_vmcb(svm); fx_init(&svm->vcpu); svm->vcpu.fpu_active = 1; @@ -1024,7 +1024,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm->vmcb); + init_vmcb(svm); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; -- cgit v1.2.3 From f65c229c3e7743c6654c16b9ec6248466b5eef21 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 13 Feb 2008 18:58:46 +0100 Subject: KVM: SVM: allocate the MSR permission map per VCPU This patch changes the kvm-amd module to allocate the SVM MSR permission map per VCPU instead of a global map for all VCPUs. With this we have more flexibility allowing specific guests to access virtualized MSRs. This is required for LBR virtualization. Signed-off-by: Joerg Roedel Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_svm.h | 2 ++ arch/x86/kvm/svm.c | 67 ++++++++++++++++++++++++-------------------------- 2 files changed, 34 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index ecdfe97e463..65ef0fc2c03 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -39,6 +39,8 @@ struct vcpu_svm { unsigned long host_db_regs[NUM_DB_REGS]; unsigned long host_dr6; unsigned long host_dr7; + + u32 *msrpm; }; #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d934819733c..281a2ffe122 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -65,7 +65,6 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) } unsigned long iopm_base; -unsigned long msrpm_base; struct kvm_ldttss_desc { u16 limit0; @@ -370,12 +369,29 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, BUG(); } +static void svm_vcpu_init_msrpm(u32 *msrpm) +{ + memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + +#ifdef CONFIG_X86_64 + set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_LSTAR, 1, 1); + set_msr_interception(msrpm, MSR_CSTAR, 1, 1); + set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); +#endif + set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); + set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); + set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); +} + static __init int svm_hardware_setup(void) { int cpu; struct page *iopm_pages; - struct page *msrpm_pages; - void *iopm_va, *msrpm_va; + void *iopm_va; int r; iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); @@ -388,37 +404,13 @@ static __init int svm_hardware_setup(void) clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; - - msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); - - r = -ENOMEM; - if (!msrpm_pages) - goto err_1; - - msrpm_va = page_address(msrpm_pages); - memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); - msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT; - -#ifdef CONFIG_X86_64 - set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1); - set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1); - set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1); -#endif - set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); - if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); for_each_online_cpu(cpu) { r = svm_cpu_init(cpu); if (r) - goto err_2; + goto err; } svm_features = cpuid_edx(SVM_CPUID_FUNC); @@ -438,10 +430,7 @@ static __init int svm_hardware_setup(void) return 0; -err_2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); - msrpm_base = 0; -err_1: +err: __free_pages(iopm_pages, IOPM_ALLOC_ORDER); iopm_base = 0; return r; @@ -449,9 +438,8 @@ err_1: static __exit void svm_hardware_unsetup(void) { - __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); - iopm_base = msrpm_base = 0; + iopm_base = 0; } static void init_seg(struct vmcb_seg *seg) @@ -536,7 +524,7 @@ static void init_vmcb(struct vcpu_svm *svm) (1ULL << INTERCEPT_MWAIT); control->iopm_base_pa = iopm_base; - control->msrpm_base_pa = msrpm_base; + control->msrpm_base_pa = __pa(svm->msrpm); control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; @@ -615,6 +603,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; struct page *page; + struct page *msrpm_pages; int err; svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); @@ -633,6 +622,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit; } + err = -ENOMEM; + msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + if (!msrpm_pages) + goto uninit; + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + svm->vmcb = page_address(page); clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; @@ -661,6 +657,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); + __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } -- cgit v1.2.3 From 24e09cbf480a72f9c952af4ca77b159503dca44b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 13 Feb 2008 18:58:47 +0100 Subject: KVM: SVM: enable LBR virtualization This patch implements the Last Branch Record Virtualization (LBRV) feature of the AMD Barcelona and Phenom processors into the kvm-amd module. It will only be enabled if the guest enables last branch recording in the DEBUG_CTL MSR. So there is no increased world switch overhead when the guest doesn't use these MSRs. Signed-off-by: Joerg Roedel Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 281a2ffe122..7d73e935dcc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -47,6 +47,8 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) +#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) + /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; @@ -387,6 +389,28 @@ static void svm_vcpu_init_msrpm(u32 *msrpm) set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); } +static void svm_enable_lbrv(struct vcpu_svm *svm) +{ + u32 *msrpm = svm->msrpm; + + svm->vmcb->control.lbr_ctl = 1; + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); +} + +static void svm_disable_lbrv(struct vcpu_svm *svm) +{ + u32 *msrpm = svm->msrpm; + + svm->vmcb->control.lbr_ctl = 0; + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1231,8 +1255,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) svm->vmcb->save.sysenter_esp = data; break; case MSR_IA32_DEBUGCTLMSR: - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __FUNCTION__, data); + if (!svm_has(SVM_FEATURE_LBRV)) { + pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", + __FUNCTION__, data); + break; + } + if (data & DEBUGCTL_RESERVED_BITS) + return 1; + + svm->vmcb->save.dbgctl = data; + if (data & (1ULL<<0)) + svm_enable_lbrv(svm); + else + svm_disable_lbrv(svm); break; case MSR_K7_EVNTSEL0: case MSR_K7_EVNTSEL1: -- cgit v1.2.3 From 18068523d3a0b41fcee5b53cdb437a0ab4d65e4b Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Fri, 15 Feb 2008 17:52:47 -0200 Subject: KVM: paravirtualized clocksource: host part This is the host part of kvm clocksource implementation. As it does not include clockevents, it is a fairly simple implementation. We only have to register a per-vcpu area, and start writing to it periodically. The area is binary compatible with xen, as we use the same shadow_info structure. [marcelo: fix bad_page on MSR_KVM_SYSTEM_TIME] [avi: save full value of the msr, even if enable bit is clear] [avi: clear previous value of time_page] Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c910c774a9..256c0fc92b6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -19,6 +19,7 @@ #include "irq.h" #include "mmu.h" +#include #include #include #include @@ -424,7 +425,7 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TIME_STAMP_COUNTER, + MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, }; static unsigned num_msrs_to_save; @@ -482,6 +483,70 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return kvm_set_msr(vcpu, index, *data); } +static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) +{ + static int version; + struct kvm_wall_clock wc; + struct timespec wc_ts; + + if (!wall_clock) + return; + + version++; + + down_read(&kvm->slots_lock); + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); + + wc_ts = current_kernel_time(); + wc.wc_sec = wc_ts.tv_sec; + wc.wc_nsec = wc_ts.tv_nsec; + wc.wc_version = version; + + kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); + + version++; + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); + up_read(&kvm->slots_lock); +} + +static void kvm_write_guest_time(struct kvm_vcpu *v) +{ + struct timespec ts; + unsigned long flags; + struct kvm_vcpu_arch *vcpu = &v->arch; + void *shared_kaddr; + + if ((!vcpu->time_page)) + return; + + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); + kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, + &vcpu->hv_clock.tsc_timestamp); + ktime_get_ts(&ts); + local_irq_restore(flags); + + /* With all the info we got, fill in the values */ + + vcpu->hv_clock.system_time = ts.tv_nsec + + (NSEC_PER_SEC * (u64)ts.tv_sec); + /* + * The interface expects us to write an even number signaling that the + * update is finished. Since the guest won't see the intermediate + * state, we just write "2" at the end + */ + vcpu->hv_clock.version = 2; + + shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); + + memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); + + kunmap_atomic(shared_kaddr, KM_USER0); + + mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { @@ -511,6 +576,44 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; + case MSR_KVM_WALL_CLOCK: + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data); + break; + case MSR_KVM_SYSTEM_TIME: { + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + vcpu->arch.time = data; + + /* we verify if the enable bit is set... */ + if (!(data & 1)) + break; + + /* ...but clean it before doing the actual write */ + vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + + vcpu->arch.hv_clock.tsc_to_system_mul = + clocksource_khz2mult(tsc_khz, 22); + vcpu->arch.hv_clock.tsc_shift = 22; + + down_read(¤t->mm->mmap_sem); + down_read(&vcpu->kvm->slots_lock); + vcpu->arch.time_page = + gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); + up_read(&vcpu->kvm->slots_lock); + up_read(¤t->mm->mmap_sem); + + if (is_error_page(vcpu->arch.time_page)) { + kvm_release_page_clean(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + kvm_write_guest_time(vcpu); + break; + } default: pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); return 1; @@ -569,6 +672,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_EFER: data = vcpu->arch.shadow_efer; break; + case MSR_KVM_WALL_CLOCK: + data = vcpu->kvm->arch.wall_clock; + break; + case MSR_KVM_SYSTEM_TIME: + data = vcpu->arch.time; + break; default: pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; @@ -696,6 +805,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_USER_MEMORY: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_CLOCKSOURCE: r = 1; break; case KVM_CAP_VAPIC: @@ -771,6 +881,7 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); + kvm_write_guest_time(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 790c73f6289a204f858ffdcbe4a2b38e91657ec6 Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Fri, 15 Feb 2008 17:52:48 -0200 Subject: x86: KVM guest: paravirtualized clocksource This is the guest part of kvm clock implementation It does not do tsc-only timing, as tsc can have deltas between cpus, and it did not seem worthy to me to keep adjusting them. We do use it, however, for fine-grained adjustment. Other than that, time comes from the host. [randy dunlap: add missing include] [randy dunlap: disallow on Voyager or Visual WS] Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Randy Dunlap Signed-off-by: Avi Kivity --- arch/x86/Kconfig | 11 ++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/kvmclock.c | 160 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_32.c | 5 ++ arch/x86/kernel/setup_64.c | 5 ++ 5 files changed, 182 insertions(+) create mode 100644 arch/x86/kernel/kvmclock.c (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2fadf794483..40cedc255ed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -373,6 +373,17 @@ config VMI at the moment), by linking the kernel to a GPL-ed ROM module provided by the hypervisor. +config KVM_CLOCK + bool "KVM paravirtualized clock" + select PARAVIRT + depends on !(X86_VISWS || X86_VOYAGER) + help + Turning on this option will allow you to run a paravirtualized clock + when running over the KVM hypervisor. Instead of relying on a PIT + (or probably other) emulation by the underlying device model, the host + provides the guest with timing infrastructure such as time of day, and + system time + source "arch/x86/lguest/Kconfig" config PARAVIRT diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 90e092d0af0..483047a3302 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -80,6 +80,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o +obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o ifdef CONFIG_INPUT_PCSPKR diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c new file mode 100644 index 00000000000..b999f5e5b3b --- /dev/null +++ b/arch/x86/kernel/kvmclock.c @@ -0,0 +1,160 @@ +/* KVM paravirtual clock driver. A clocksource implementation + Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include +#include +#include +#include +#include +#include + +#define KVM_SCALE 22 + +static int kvmclock = 1; + +static int parse_no_kvmclock(char *arg) +{ + kvmclock = 0; + return 0; +} +early_param("no-kvmclock", parse_no_kvmclock); + +/* The hypervisor will put information about time periodically here */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); +#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field + +static inline u64 kvm_get_delta(u64 last_tsc) +{ + int cpu = smp_processor_id(); + u64 delta = native_read_tsc() - last_tsc; + return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; +} + +static struct kvm_wall_clock wall_clock; +static cycle_t kvm_clock_read(void); +/* + * The wallclock is the time of day when we booted. Since then, some time may + * have elapsed since the hypervisor wrote the data. So we try to account for + * that with system time + */ +unsigned long kvm_get_wallclock(void) +{ + u32 wc_sec, wc_nsec; + u64 delta; + struct timespec ts; + int version, nsec; + int low, high; + + low = (int)__pa(&wall_clock); + high = ((u64)__pa(&wall_clock) >> 32); + + delta = kvm_clock_read(); + + native_write_msr(MSR_KVM_WALL_CLOCK, low, high); + do { + version = wall_clock.wc_version; + rmb(); + wc_sec = wall_clock.wc_sec; + wc_nsec = wall_clock.wc_nsec; + rmb(); + } while ((wall_clock.wc_version != version) || (version & 1)); + + delta = kvm_clock_read() - delta; + delta += wc_nsec; + nsec = do_div(delta, NSEC_PER_SEC); + set_normalized_timespec(&ts, wc_sec + delta, nsec); + /* + * Of all mechanisms of time adjustment I've tested, this one + * was the champion! + */ + return ts.tv_sec + 1; +} + +int kvm_set_wallclock(unsigned long now) +{ + return 0; +} + +/* + * This is our read_clock function. The host puts an tsc timestamp each time + * it updates a new time. Without the tsc adjustment, we can have a situation + * in which a vcpu starts to run earlier (smaller system_time), but probes + * time later (compared to another vcpu), leading to backwards time + */ +static cycle_t kvm_clock_read(void) +{ + u64 last_tsc, now; + int cpu; + + preempt_disable(); + cpu = smp_processor_id(); + + last_tsc = get_clock(cpu, tsc_timestamp); + now = get_clock(cpu, system_time); + + now += kvm_get_delta(last_tsc); + preempt_enable(); + + return now; +} +static struct clocksource kvm_clock = { + .name = "kvm-clock", + .read = kvm_clock_read, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .mult = 1 << KVM_SCALE, + .shift = KVM_SCALE, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static int kvm_register_clock(void) +{ + int cpu = smp_processor_id(); + int low, high; + low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; + high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + + return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); +} + +static void kvm_setup_secondary_clock(void) +{ + /* + * Now that the first cpu already had this clocksource initialized, + * we shouldn't fail. + */ + WARN_ON(kvm_register_clock()); + /* ok, done with our trickery, call native */ + setup_secondary_APIC_clock(); +} + +void __init kvmclock_init(void) +{ + if (!kvm_para_available()) + return; + + if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + if (kvm_register_clock()) + return; + pv_time_ops.get_wallclock = kvm_get_wallclock; + pv_time_ops.set_wallclock = kvm_set_wallclock; + pv_time_ops.sched_clock = kvm_clock_read; + pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; + clocksource_register(&kvm_clock); + } +} diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 44cc9b93393..5a849ddd09e 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -47,6 +47,7 @@ #include #include #include +#include #include