diff options
173 files changed, 5562 insertions, 3350 deletions
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 03497909539..31575e220f3 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -556,3 +556,35 @@ Why: udev fully replaces this special file system that only contains CAPI NCCI TTY device nodes. User space (pppdcapiplugin) works without noticing the difference. Who: Jan Kiszka <jan.kiszka@web.de> + +---------------------------- + +What: KVM memory aliases support +When: July 2010 +Why: Memory aliasing support is used for speeding up guest vga access + through the vga windows. + + Modern userspace no longer uses this feature, so it's just bitrotted + code and can be removed with no impact. +Who: Avi Kivity <avi@redhat.com> + +---------------------------- + +What: KVM kernel-allocated memory slots +When: July 2010 +Why: Since 2.6.25, kvm supports user-allocated memory slots, which are + much more flexible than kernel-allocated slots. All current userspace + supports the newer interface and this code can be removed with no + impact. +Who: Avi Kivity <avi@redhat.com> + +---------------------------- + +What: KVM paravirt mmu host support +When: January 2011 +Why: The paravirt mmu host support is slower than non-paravirt mmu, both + on newer and older hardware. It is already not exposed to the guest, + and kept only for live migration purposes. +Who: Avi Kivity <avi@redhat.com> + +---------------------------- diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 18b9d0ca063..06bbbed7120 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -460,13 +460,6 @@ in sys_read() and friends. --------------------------- dquot_operations ------------------------------- prototypes: - int (*initialize) (struct inode *, int); - int (*drop) (struct inode *); - int (*alloc_space) (struct inode *, qsize_t, int); - int (*alloc_inode) (const struct inode *, unsigned long); - int (*free_space) (struct inode *, qsize_t); - int (*free_inode) (const struct inode *, unsigned long); - int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations. What filesystem should expect from the generic quota functions: FS recursion Held locks when called -initialize: yes maybe dqonoff_sem -drop: yes - -alloc_space: ->mark_dirty() - -alloc_inode: ->mark_dirty() - -free_space: ->mark_dirty() - -free_inode: ->mark_dirty() - -transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem @@ -495,10 +481,6 @@ write_info: yes dqonoff_sem FS recursion means calling ->quota_read() and ->quota_write() from superblock operations. -->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called -only directly by the filesystem and do not call any fs functions only -the ->mark_dirty() operation. - More details about quota locking can be found in fs/dquot.c. --------------------------- vm_operations_struct ----------------------------- diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index 2811e452f75..c6416a39816 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt @@ -23,12 +23,12 @@ of a virtual machine. The ioctls belong to three classes Only run vcpu ioctls from the same thread that was used to create the vcpu. -2. File descritpors +2. File descriptors The kvm API is centered around file descriptors. An initial open("/dev/kvm") obtains a handle to the kvm subsystem; this handle can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this -handle will create a VM file descripror which can be used to issue VM +handle will create a VM file descriptor which can be used to issue VM ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu and return a file descriptor pointing to it. Finally, ioctls on a vcpu fd can be used to control the vcpu, including the important task of @@ -643,7 +643,7 @@ Type: vm ioctl Parameters: struct kvm_clock_data (in) Returns: 0 on success, -1 on error -Sets the current timestamp of kvmclock to the valued specific in its parameter. +Sets the current timestamp of kvmclock to the value specified in its parameter. In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios such as migration. @@ -795,11 +795,11 @@ Unused. __u64 data_offset; /* relative to kvm_run start */ } io; -If exit_reason is KVM_EXIT_IO_IN or KVM_EXIT_IO_OUT, then the vcpu has +If exit_reason is KVM_EXIT_IO, then the vcpu has executed a port I/O instruction which could not be satisfied by kvm. data_offset describes where the data is located (KVM_EXIT_IO_OUT) or where kvm expects application code to place the data for the next -KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a patcked array. +KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. struct { struct kvm_debug_exit_arch arch; @@ -815,7 +815,7 @@ Unused. __u8 is_write; } mmio; -If exit_reason is KVM_EXIT_MMIO or KVM_EXIT_IO_OUT, then the vcpu has +If exit_reason is KVM_EXIT_MMIO, then the vcpu has executed a memory-mapped I/O instruction which could not be satisfied by kvm. The 'data' member contains the written data if 'is_write' is true, and should be filled by application code otherwise. diff --git a/MAINTAINERS b/MAINTAINERS index c6591bca646..51d8b5221dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3173,7 +3173,7 @@ F: arch/x86/include/asm/svm.h F: arch/x86/kvm/svm.c KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC -M: Hollis Blanchard <hollisb@us.ibm.com> +M: Alexander Graf <agraf@suse.de> L: kvm-ppc@vger.kernel.org W: http://kvm.qumranet.com S: Supported diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 01c75797119..fa4d1e59deb 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -26,6 +26,7 @@ config KVM select ANON_INODES select HAVE_KVM_IRQCHIP select KVM_APIC_ARCHITECTURE + select KVM_MMIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 5fdeec5fddc..26e0e089bfe 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -241,10 +241,10 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; mmio: if (p->dir) - r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr, + r = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, p->addr, p->size, &p->data); else - r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr, + r = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, p->addr, p->size, &p->data); if (r) printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr); @@ -636,12 +636,9 @@ static void kvm_vcpu_post_transition(struct kvm_vcpu *vcpu) static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { union context *host_ctx, *guest_ctx; - int r; + int r, idx; - /* - * down_read() may sleep and return with interrupts enabled - */ - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); again: if (signal_pending(current)) { @@ -663,7 +660,7 @@ again: if (r < 0) goto vcpu_run_fail; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); kvm_guest_enter(); /* @@ -687,7 +684,7 @@ again: kvm_guest_exit(); preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_handle_exit(kvm_run, vcpu); @@ -697,10 +694,10 @@ again: } out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); if (r > 0) { kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); goto again; } @@ -971,7 +968,7 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = kvm_setup_default_irq_routing(kvm); if (r) { - kfree(kvm->arch.vioapic); + kvm_ioapic_destroy(kvm); goto out; } break; @@ -1377,12 +1374,14 @@ static void free_kvm(struct kvm *kvm) static void kvm_release_vm_pages(struct kvm *kvm) { + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i, j; unsigned long base_gfn; - for (i = 0; i < kvm->nmemslots; i++) { - memslot = &kvm->memslots[i]; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) { + memslot = &slots->memslots[i]; base_gfn = memslot->base_gfn; for (j = 0; j < memslot->npages; j++) { @@ -1405,6 +1404,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vioapic); kvm_release_vm_pages(kvm); kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); free_kvm(kvm); } @@ -1576,15 +1576,15 @@ out: return r; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { unsigned long i; unsigned long pfn; - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + int npages = memslot->npages; unsigned long base_gfn = memslot->base_gfn; if (base_gfn + npages > (KVM_MAX_MEM_SIZE >> PAGE_SHIFT)) @@ -1608,6 +1608,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm, return 0; } +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + return; +} + void kvm_arch_flush_shadow(struct kvm *kvm) { kvm_flush_remote_tlbs(kvm); @@ -1802,7 +1810,7 @@ static int kvm_ia64_sync_dirty_log(struct kvm *kvm, if (log->slot >= KVM_MEMORY_SLOTS) goto out; - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; r = -ENOENT; if (!memslot->dirty_bitmap) goto out; @@ -1827,6 +1835,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; int is_dirty = 0; + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->arch.dirty_log_lock); r = kvm_ia64_sync_dirty_log(kvm, log); @@ -1840,12 +1849,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { kvm_flush_remote_tlbs(kvm); - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; memset(memslot->dirty_bitmap, 0, n); } r = 0; out: + mutex_unlock(&kvm->slots_lock); spin_unlock(&kvm->arch.dirty_log_lock); return r; } diff --git a/arch/ia64/kvm/kvm_fw.c b/arch/ia64/kvm/kvm_fw.c index e4b82319881..cb548ee9fca 100644 --- a/arch/ia64/kvm/kvm_fw.c +++ b/arch/ia64/kvm/kvm_fw.c @@ -75,7 +75,7 @@ static void set_pal_result(struct kvm_vcpu *vcpu, struct exit_ctl_data *p; p = kvm_get_exit_data(vcpu); - if (p && p->exit_reason == EXIT_REASON_PAL_CALL) { + if (p->exit_reason == EXIT_REASON_PAL_CALL) { p->u.pal_data.ret = result; return ; } @@ -87,7 +87,7 @@ static void set_sal_result(struct kvm_vcpu *vcpu, struct exit_ctl_data *p; p = kvm_get_exit_data(vcpu); - if (p && p->exit_reason == EXIT_REASON_SAL_CALL) { + if (p->exit_reason == EXIT_REASON_SAL_CALL) { p->u.sal_data.ret = result; return ; } @@ -322,7 +322,7 @@ static u64 kvm_get_pal_call_index(struct kvm_vcpu *vcpu) struct exit_ctl_data *p; p = kvm_get_exit_data(vcpu); - if (p && (p->exit_reason == EXIT_REASON_PAL_CALL)) + if (p->exit_reason == EXIT_REASON_PAL_CALL) index = p->u.pal_data.gr28; return index; @@ -646,18 +646,16 @@ static void kvm_get_sal_call_data(struct kvm_vcpu *vcpu, u64 *in0, u64 *in1, p = kvm_get_exit_data(vcpu); - if (p) { - if (p->exit_reason == EXIT_REASON_SAL_CALL) { - *in0 = p->u.sal_data.in0; - *in1 = p->u.sal_data.in1; - *in2 = p->u.sal_data.in2; - *in3 = p->u.sal_data.in3; - *in4 = p->u.sal_data.in4; - *in5 = p->u.sal_data.in5; - *in6 = p->u.sal_data.in6; - *in7 = p->u.sal_data.in7; - return ; - } + if (p->exit_reason == EXIT_REASON_SAL_CALL) { + *in0 = p->u.sal_data.in0; + *in1 = p->u.sal_data.in1; + *in2 = p->u.sal_data.in2; + *in3 = p->u.sal_data.in3; + *in4 = p->u.sal_data.in4; + *in5 = p->u.sal_data.in5; + *in6 = p->u.sal_data.in6; + *in7 = p->u.sal_data.in7; + return ; } *in0 = 0; } diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c index 9bf55afd08d..fb8f9f59a1e 100644 --- a/arch/ia64/kvm/mmio.c +++ b/arch/ia64/kvm/mmio.c @@ -316,8 +316,8 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma) return; } else { inst_type = -1; - panic_vm(vcpu, "Unsupported MMIO access instruction! \ - Bunld[0]=0x%lx, Bundle[1]=0x%lx\n", + panic_vm(vcpu, "Unsupported MMIO access instruction! " + "Bunld[0]=0x%lx, Bundle[1]=0x%lx\n", bundle.i64[0], bundle.i64[1]); } diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c index dce75b70cdd..958815c9787 100644 --- a/arch/ia64/kvm/vcpu.c +++ b/arch/ia64/kvm/vcpu.c @@ -1639,8 +1639,8 @@ void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val) * Otherwise panic */ if (val & (IA64_PSR_PK | IA64_PSR_IS | IA64_PSR_VM)) - panic_vm(vcpu, "Only support guests with vpsr.pk =0 \ - & vpsr.is=0\n"); + panic_vm(vcpu, "Only support guests with vpsr.pk =0 " + "& vpsr.is=0\n"); /* * For those IA64_PSR bits: id/da/dd/ss/ed/ia diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index af2abe74f54..aadf2dd6f84 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -97,4 +97,10 @@ #define RESUME_HOST RESUME_FLAG_HOST #define RESUME_HOST_NV (RESUME_FLAG_HOST|RESUME_FLAG_NV) +#define KVM_GUEST_MODE_NONE 0 +#define KVM_GUEST_MODE_GUEST 1 +#define KVM_GUEST_MODE_SKIP 2 + +#define KVM_INST_FETCH_FAILED -1 + #endif /* __POWERPC_KVM_ASM_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 74b7369770d..db7db0a9696 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -22,7 +22,7 @@ #include <linux/types.h> #include <linux/kvm_host.h> -#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s_64_asm.h> struct kvmppc_slb { u64 esid; @@ -33,7 +33,8 @@ struct kvmppc_slb { bool Ks; bool Kp; bool nx; - bool large; + bool large; /* PTEs are 16MB */ + bool tb; /* 1TB segment */ bool class; }; @@ -69,6 +70,7 @@ struct kvmppc_sid_map { struct kvmppc_vcpu_book3s { struct kvm_vcpu vcpu; + struct kvmppc_book3s_shadow_vcpu shadow_vcpu; struct kvmppc_sid_map sid_map[SID_MAP_NUM]; struct kvmppc_slb slb[64]; struct { @@ -89,6 +91,7 @@ struct kvmppc_vcpu_book3s { u64 vsid_next; u64 vsid_max; int context_id; + ulong prog_flags; /* flags to inject when giving a 700 trap */ }; #define CONTEXT_HOST 0 @@ -119,6 +122,10 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, extern u32 kvmppc_trampoline_lowmem; extern u32 kvmppc_trampoline_enter; +extern void kvmppc_rmcall(ulong srr0, ulong srr1); +extern void kvmppc_load_up_fpu(void); +extern void kvmppc_load_up_altivec(void); +extern void kvmppc_load_up_vsx(void); static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/include/asm/kvm_book3s_64_asm.h b/arch/powerpc/include/asm/kvm_book3s_64_asm.h index 2e06ee8184e..183461b4840 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_64_asm.h @@ -20,6 +20,8 @@ #ifndef __ASM_KVM_BOOK3S_ASM_H__ #define __ASM_KVM_BOOK3S_ASM_H__ +#ifdef __ASSEMBLY__ + #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #include <asm/kvm_asm.h> @@ -55,4 +57,20 @@ kvmppc_resume_\intno: #endif /* CONFIG_KVM_BOOK3S_64_HANDLER */ +#else /*__ASSEMBLY__ */ + +struct kvmppc_book3s_shadow_vcpu { + ulong gpr[14]; + u32 cr; + u32 xer; + ulong host_r1; + ulong host_r2; + ulong handler; + ulong scratch0; + ulong scratch1; + ulong vmhandler; +}; + +#endif /*__ASSEMBLY__ */ + #endif /* __ASM_KVM_BOOK3S_ASM_H__ */ diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h index 9d497ce4972..7fea26fffb2 100644 --- a/arch/powerpc/include/asm/kvm_e500.h +++ b/arch/powerpc/include/asm/kvm_e500.h @@ -52,9 +52,12 @@ struct kvmppc_vcpu_e500 { u32 mas5; u32 mas6; u32 mas7; + u32 l1csr0; u32 l1csr1; u32 hid0; u32 hid1; + u32 tlb0cfg; + u32 tlb1cfg; struct kvm_vcpu vcpu; }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1201f62d0d7..5e5bae7e152 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -167,23 +167,40 @@ struct kvm_vcpu_arch { ulong trampoline_lowmem; ulong trampoline_enter; ulong highmem_handler; + ulong rmcall; ulong host_paca_phys; struct kvmppc_mmu mmu; #endif - u64 fpr[32]; ulong gpr[32]; + u64 fpr[32]; + u32 fpscr; + +#ifdef CONFIG_ALTIVEC + vector128 vr[32]; + vector128 vscr; +#endif + +#ifdef CONFIG_VSX + u64 vsr[32]; +#endif + ulong pc; - u32 cr; ulong ctr; ulong lr; + +#ifdef CONFIG_BOOKE ulong xer; + u32 cr; +#endif ulong msr; #ifdef CONFIG_PPC64 ulong shadow_msr; + ulong shadow_srr1; ulong hflags; + ulong guest_owned_ext; #endif u32 mmucr; ulong sprg0; @@ -242,6 +259,8 @@ struct kvm_vcpu_arch { #endif ulong fault_dear; ulong fault_esr; + ulong queued_dear; + ulong queued_esr; gpa_t paddr_accessed; u8 io_gpr; /* GPR used as IO source/target */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 269ee46ab02..e2642829e43 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -28,6 +28,9 @@ #include <linux/types.h> #include <linux/kvm_types.h> #include <linux/kvm_host.h> +#ifdef CONFIG_PPC_BOOK3S +#include <asm/kvm_book3s.h> +#endif enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -80,8 +83,9 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); -extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu); +extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); @@ -95,4 +99,81 @@ extern void kvmppc_booke_exit(void); extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); +#ifdef CONFIG_PPC_BOOK3S + +/* We assume we're always acting on the current vcpu */ + +static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) +{ + if ( num < 14 ) { + get_paca()->shadow_vcpu.gpr[num] = val; + to_book3s(vcpu)->shadow_vcpu.gpr[num] = val; + } else + vcpu->arch.gpr[num] = val; +} + +static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) +{ + if ( num < 14 ) + return get_paca()->shadow_vcpu.gpr[num]; + else + return vcpu->arch.gpr[num]; +} + +static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) +{ + get_paca()->shadow_vcpu.cr = val; + to_book3s(vcpu)->shadow_vcpu.cr = val; +} + +static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) +{ + return get_paca()->shadow_vcpu.cr; +} + +static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) +{ + get_paca()->shadow_vcpu.xer = val; + to_book3s(vcpu)->shadow_vcpu.xer = val; +} + +static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) +{ + return get_paca()->shadow_vcpu.xer; +} + +#else + +static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) +{ + vcpu->arch.gpr[num] = val; +} + +static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) +{ + return vcpu->arch.gpr[num]; +} + +static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) +{ + vcpu->arch.cr = val; +} + +static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr; +} + +static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) +{ + vcpu->arch.xer = val; +} + +static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.xer; +} + +#endif + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 5e9b4ef7141..d8a693109c8 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -19,6 +19,9 @@ #include <asm/mmu.h> #include <asm/page.h> #include <asm/exception-64e.h> +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +#include <asm/kvm_book3s_64_asm.h> +#endif register struct paca_struct *local_paca asm("r13"); @@ -135,6 +138,8 @@ struct paca_struct { u64 esid; u64 vsid; } kvm_slb[64]; /* guest SLB */ + /* We use this to store guest state in */ + struct kvmppc_book3s_shadow_vcpu shadow_vcpu; u8 kvm_slb_max; /* highest used guest slb entry */ u8 kvm_in_guest; /* are we inside the guest? */ #endif diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index bc8dd53f718..5572e86223f 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -426,6 +426,10 @@ #define SRR1_WAKEMT 0x00280000 /* mtctrl */ #define SRR1_WAKEDEC 0x00180000 /* Decrementer interrupt */ #define SRR1_WAKETHERM 0x00100000 /* Thermal management interrupt */ +#define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */ +#define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */ +#define SRR1_PROGTRAP 0x00020000 /* Trap */ +#define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */ #define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */ #define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index a6c2b63227b..957ceb7059c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -194,6 +194,30 @@ int main(void) DEFINE(PACA_KVM_IN_GUEST, offsetof(struct paca_struct, kvm_in_guest)); DEFINE(PACA_KVM_SLB, offsetof(struct paca_struct, kvm_slb)); DEFINE(PACA_KVM_SLB_MAX, offsetof(struct paca_struct, kvm_slb_max)); + DEFINE(PACA_KVM_CR, offsetof(struct paca_struct, shadow_vcpu.cr)); + DEFINE(PACA_KVM_XER, offsetof(struct paca_struct, shadow_vcpu.xer)); + DEFINE(PACA_KVM_R0, offsetof(struct paca_struct, shadow_vcpu.gpr[0])); + DEFINE(PACA_KVM_R1, offsetof(struct paca_struct, shadow_vcpu.gpr[1])); + DEFINE(PACA_KVM_R2, offsetof(struct paca_struct, shadow_vcpu.gpr[2])); + DEFINE(PACA_KVM_R3, offsetof(struct paca_struct, shadow_vcpu.gpr[3])); + DEFINE(PACA_KVM_R4, offsetof(struct paca_struct, shadow_vcpu.gpr[4])); + DEFINE(PACA_KVM_R5, offsetof(struct paca_struct, shadow_vcpu.gpr[5])); + DEFINE(PACA_KVM_R6, offsetof(struct paca_struct, shadow_vcpu.gpr[6])); + DEFINE(PACA_KVM_R7, offsetof(struct paca_struct, shadow_vcpu.gpr[7])); + DEFINE(PACA_KVM_R8, offsetof(struct paca_struct, shadow_vcpu.gpr[8])); + DEFINE(PACA_KVM_R9, offsetof(struct paca_struct, shadow_vcpu.gpr[9])); + DEFINE(PACA_KVM_R10, offsetof(struct paca_struct, shadow_vcpu.gpr[10])); + DEFINE(PACA_KVM_R11, offsetof(struct paca_struct, shadow_vcpu.gpr[11])); + DEFINE(PACA_KVM_R12, offsetof(struct paca_struct, shadow_vcpu.gpr[12])); + DEFINE(PACA_KVM_R13, offsetof(struct paca_struct, shadow_vcpu.gpr[13])); + DEFINE(PACA_KVM_HOST_R1, offsetof(struct paca_struct, shadow_vcpu.host_r1)); + DEFINE(PACA_KVM_HOST_R2, offsetof(struct paca_struct, shadow_vcpu.host_r2)); + DEFINE(PACA_KVM_VMHANDLER, offsetof(struct paca_struct, + shadow_vcpu.vmhandler)); + DEFINE(PACA_KVM_SCRATCH0, offsetof(struct paca_struct, + shadow_vcpu.scratch0)); + DEFINE(PACA_KVM_SCRATCH1, offsetof(struct paca_struct, + shadow_vcpu.scratch1)); #endif #endif /* CONFIG_PPC64 */ @@ -389,8 +413,6 @@ int main(void) DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); - DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); - DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr)); DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc)); DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr)); @@ -411,11 +433,16 @@ int main(void) DEFINE(VCPU_HOST_R2, offsetof(struct kvm_vcpu, arch.host_r2)); DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr)); DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); + DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem)); DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter)); DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler)); + DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall)); DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); -#endif +#else + DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); + DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); +#endif /* CONFIG_PPC64 */ #endif #ifdef CONFIG_44x DEFINE(PGD_T_LOG2, PGD_T_LOG2); diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 425451453e9..ab3e392ac63 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -107,6 +107,7 @@ EXPORT_SYMBOL(giveup_altivec); #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_VSX EXPORT_SYMBOL(giveup_vsx); +EXPORT_SYMBOL_GPL(__giveup_vsx); #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE EXPORT_SYMBOL(giveup_spe); diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 61af58fcece..65ea083a5b2 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -65,13 +65,14 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, */ switch (dcrn) { case DCRN_CPR0_CONFIG_ADDR: - vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); break; case DCRN_CPR0_CONFIG_DATA: local_irq_disable(); mtdcr(DCRN_CPR0_CONFIG_ADDR, vcpu->arch.cpr0_cfgaddr); - vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA); + kvmppc_set_gpr(vcpu, rt, + mfdcr(DCRN_CPR0_CONFIG_DATA)); local_irq_enable(); break; default: @@ -93,11 +94,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, /* emulate some access in kernel */ switch (dcrn) { case DCRN_CPR0_CONFIG_ADDR: - vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs]; + vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); break; default: run->dcr.dcrn = dcrn; - run->dcr.data = vcpu->arch.gpr[rs]; + run->dcr.data = kvmppc_get_gpr(vcpu, rs); run->dcr.is_write = 1; vcpu->arch.dcr_needed = 1; kvmppc_account_exit(vcpu, DCR_EXITS); @@ -146,13 +147,13 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) switch (sprn) { case SPRN_PID: - kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break; + kvmppc_set_pid(vcpu, kvmppc_get_gpr(vcpu, rs)); break; case SPRN_MMUCR: - vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break; + vcpu->arch.mmucr = kvmppc_get_gpr(vcpu, rs); break; case SPRN_CCR0: - vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.ccr0 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_CCR1: - vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.ccr1 = kvmppc_get_gpr(vcpu, rs); break; default: emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); } @@ -167,13 +168,13 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_PID: - vcpu->arch.gpr[rt] = vcpu->arch.pid; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.pid); break; case SPRN_MMUCR: - vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucr); break; case SPRN_CCR0: - vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr0); break; case SPRN_CCR1: - vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr1); break; default: emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); } diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index ff3cb63b811..2570fcc7665 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -439,7 +439,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) struct kvmppc_44x_tlbe *tlbe; unsigned int gtlb_index; - gtlb_index = vcpu->arch.gpr[ra]; + gtlb_index = kvmppc_get_gpr(vcpu, ra); if (gtlb_index > KVM44x_GUEST_TLB_SIZE) { printk("%s: index %d\n", __func__, gtlb_index); kvmppc_dump_vcpu(vcpu); @@ -455,15 +455,15 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) switch (ws) { case PPC44x_TLB_PAGEID: tlbe->tid = get_mmucr_stid(vcpu); - tlbe->word0 = vcpu->arch.gpr[rs]; + tlbe->word0 = kvmppc_get_gpr(vcpu, rs); break; case PPC44x_TLB_XLAT: - tlbe->word1 = vcpu->arch.gpr[rs]; + tlbe->word1 = kvmppc_get_gpr(vcpu, rs); break; case PPC44x_TLB_ATTRIB: - tlbe->word2 = vcpu->arch.gpr[rs]; + tlbe->word2 = kvmppc_get_gpr(vcpu, rs); break; default: @@ -500,18 +500,20 @@ int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc) unsigned int as = get_mmucr_sts(vcpu); unsigned int pid = get_mmucr_stid(vcpu); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); gtlb_index = kvmppc_44x_tlb_index(vcpu, ea, pid, as); if (rc) { + u32 cr = kvmppc_get_cr(vcpu); + if (gtlb_index < 0) - vcpu->arch.cr &= ~0x20000000; + kvmppc_set_cr(vcpu, cr & ~0x20000000); else - vcpu->arch.cr |= 0x20000000; + kvmppc_set_cr(vcpu, cr | 0x20000000); } - vcpu->arch.gpr[rt] = gtlb_index; + kvmppc_set_gpr(vcpu, rt, gtlb_index); kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); return EMULATE_DONE; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index fe037fdaf1b..60624cc9f4d 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM bool select PREEMPT_NOTIFIERS select ANON_INODES + select KVM_MMIO config KVM_BOOK3S_64_HANDLER bool diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 3e294bd9b8c..9a271f0929c 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -33,12 +33,9 @@ /* #define EXIT_DEBUG */ /* #define EXIT_DEBUG_SIMPLE */ +/* #define DEBUG_EXT */ -/* Without AGGRESSIVE_DEC we only fire off a DEC interrupt when DEC turns 0. - * When set, we retrigger a DEC interrupt after that if DEC <= 0. - * PPC32 Linux runs faster without AGGRESSIVE_DEC, PPC64 Linux requires it. */ - -/* #define AGGRESSIVE_DEC */ +static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); struct kvm_stats_debugfs_item debugfs_entries[] = { { "exits", VCPU_STAT(sum_exits) }, @@ -72,16 +69,24 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { memcpy(get_paca()->kvm_slb, to_book3s(vcpu)->slb_shadow, sizeof(get_paca()->kvm_slb)); + memcpy(&get_paca()->shadow_vcpu, &to_book3s(vcpu)->shadow_vcpu, + sizeof(get_paca()->shadow_vcpu)); get_paca()->kvm_slb_max = to_book3s(vcpu)->slb_shadow_max; } void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { memcpy(to_book3s(vcpu)->slb_shadow, get_paca()->kvm_slb, sizeof(get_paca()->kvm_slb)); + memcpy(&to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, + sizeof(get_paca()->shadow_vcpu)); to_book3s(vcpu)->slb_shadow_max = get_paca()->kvm_slb_max; + + kvmppc_giveup_ext(vcpu, MSR_FP); + kvmppc_giveup_ext(vcpu, MSR_VEC); + kvmppc_giveup_ext(vcpu, MSR_VSX); } -#if defined(AGGRESSIVE_DEC) || defined(EXIT_DEBUG) +#if defined(EXIT_DEBUG) static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu) { u64 jd = mftb() - vcpu->arch.dec_jiffies; @@ -89,6 +94,23 @@ static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu) } #endif +static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.shadow_msr = vcpu->arch.msr; + /* Guest MSR values */ + vcpu->arch.shadow_msr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | + MSR_BE | MSR_DE; + /* Process MSR values */ + vcpu->arch.shadow_msr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | + MSR_EE; + /* External providers the guest reserved */ + vcpu->arch.shadow_msr |= (vcpu->arch.msr & vcpu->arch.guest_owned_ext); + /* 64-bit Process MSR values */ +#ifdef CONFIG_PPC_BOOK3S_64 + vcpu->arch.shadow_msr |= MSR_ISF | MSR_HV; +#endif +} + void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) { ulong old_msr = vcpu->arch.msr; @@ -96,12 +118,10 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) #ifdef EXIT_DEBUG printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); #endif + msr &= to_book3s(vcpu)->msr_mask; vcpu->arch.msr = msr; - vcpu->arch.shadow_msr = msr | MSR_USER32; - vcpu->arch.shadow_msr &= ( MSR_VEC | MSR_VSX | MSR_FP | MSR_FE0 | - MSR_USER64 | MSR_SE | MSR_BE | MSR_DE | - MSR_FE1); + kvmppc_recalc_shadow_msr(vcpu); if (msr & (MSR_WE|MSR_POW)) { if (!vcpu->arch.pending_exceptions) { @@ -125,11 +145,10 @@ void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) vcpu->arch.mmu.reset_msr(vcpu); } -void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) +static int kvmppc_book3s_vec2irqprio(unsigned int vec) { unsigned int prio; - vcpu->stat.queue_intr++; switch (vec) { case 0x100: prio = BOOK3S_IRQPRIO_SYSTEM_RESET; break; case 0x200: prio = BOOK3S_IRQPRIO_MACHINE_CHECK; break; @@ -149,15 +168,31 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) default: prio = BOOK3S_IRQPRIO_MAX; break; } - set_bit(prio, &vcpu->arch.pending_exceptions); + return prio; +} + +static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, + unsigned int vec) +{ + clear_bit(kvmppc_book3s_vec2irqprio(vec), + &vcpu->arch.pending_exceptions); +} + +void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) +{ + vcpu->stat.queue_intr++; + + set_bit(kvmppc_book3s_vec2irqprio(vec), + &vcpu->arch.pending_exceptions); #ifdef EXIT_DEBUG printk(KERN_INFO "Queueing interrupt %x\n", vec); #endif } -void kvmppc_core_queue_program(struct kvm_vcpu *vcpu) +void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) { + to_book3s(vcpu)->prog_flags = flags; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM); } @@ -171,6 +206,11 @@ int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) return test_bit(BOOK3S_INTERRUPT_DECREMENTER >> 7, &vcpu->arch.pending_exceptions); } +void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) +{ + kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); +} + void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { @@ -181,6 +221,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) { int deliver = 1; int vec = 0; + ulong flags = 0ULL; switch (priority) { case BOOK3S_IRQPRIO_DECREMENTER: @@ -214,6 +255,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) break; case BOOK3S_IRQPRIO_PROGRAM: vec = BOOK3S_INTERRUPT_PROGRAM; + flags = to_book3s(vcpu)->prog_flags; break; case BOOK3S_IRQPRIO_VSX: vec = BOOK3S_INTERRUPT_VSX; @@ -244,7 +286,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) #endif if (deliver) - kvmppc_inject_interrupt(vcpu, vec, 0ULL); + kvmppc_inject_interrupt(vcpu, vec, flags); return deliver; } @@ -254,21 +296,15 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned int priority; - /* XXX be more clever here - no need to mftb() on every entry */ - /* Issue DEC again if it's still active */ -#ifdef AGGRESSIVE_DEC - if (vcpu->arch.msr & MSR_EE) - if (kvmppc_get_dec(vcpu) & 0x80000000) - kvmppc_core_queue_dec(vcpu); -#endif - #ifdef EXIT_DEBUG if (vcpu->arch.pending_exceptions) printk(KERN_EMERG "KVM: Check pending: %lx\n", vcpu->arch.pending_exceptions); #endif priority = __ffs(*pending); while (priority <= (sizeof(unsigned int) * 8)) { - if (kvmppc_book3s_irqprio_deliver(vcpu, priority)) { + if (kvmppc_book3s_irqprio_deliver(vcpu, priority) && + (priority != BOOK3S_IRQPRIO_DECREMENTER)) { + /* DEC interrupts get cleared by mtdec */ clear_bit(priority, &vcpu->arch.pending_exceptions); break; } @@ -503,14 +539,14 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Page not found in guest PTE entries */ vcpu->arch.dear = vcpu->arch.fault_dear; to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr; - vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL); + vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EPERM) { /* Storage protection */ vcpu->arch.dear = vcpu->arch.fault_dear; to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE; to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT; - vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL); + vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EINVAL) { /* Page not found in guest SLB */ @@ -532,13 +568,122 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, r = kvmppc_emulate_mmio(run, vcpu); if ( r == RESUME_HOST_NV ) r = RESUME_HOST; - if ( r == RESUME_GUEST_NV ) - r = RESUME_GUEST; } return r; } +static inline int get_fpr_index(int i) +{ +#ifdef CONFIG_VSX + i *= 2; +#endif + return i; +} + +/* Give up external provider (FPU, Altivec, VSX) */ +static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) +{ + struct thread_struct *t = ¤t->thread; + u64 *vcpu_fpr = vcpu->arch.fpr; + u64 *vcpu_vsx = vcpu->arch.vsr; + u64 *thread_fpr = (u64*)t->fpr; + int i; + + if (!(vcpu->arch.guest_owned_ext & msr)) + return; + +#ifdef DEBUG_EXT + printk(KERN_INFO "Giving up ext 0x%lx\n", msr); +#endif + + switch (msr) { + case MSR_FP: + giveup_fpu(current); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) + vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; + + vcpu->arch.fpscr = t->fpscr.val; + break; + case MSR_VEC: +#ifdef CONFIG_ALTIVEC + giveup_altivec(current); + memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); + vcpu->arch.vscr = t->vscr; +#endif + break; + case MSR_VSX: +#ifdef CONFIG_VSX + __giveup_vsx(current); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) + vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; +#endif + break; + default: + BUG(); + } + + vcpu->arch.guest_owned_ext &= ~msr; + current->thread.regs->msr &= ~msr; + kvmppc_recalc_shadow_msr(vcpu); +} + +/* Handle external providers (FPU, Altivec, VSX) */ +static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, + ulong msr) +{ + struct thread_struct *t = ¤t->thread; + u64 *vcpu_fpr = vcpu->arch.fpr; + u64 *vcpu_vsx = vcpu->arch.vsr; + u64 *thread_fpr = (u64*)t->fpr; + int i; + + if (!(vcpu->arch.msr & msr)) { + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + return RESUME_GUEST; + } + +#ifdef DEBUG_EXT + printk(KERN_INFO "Loading up ext 0x%lx\n", msr); +#endif + + current->thread.regs->msr |= msr; + + switch (msr) { + case MSR_FP: + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) + thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; + + t->fpscr.val = vcpu->arch.fpscr; + t->fpexc_mode = 0; + kvmppc_load_up_fpu(); + break; + case MSR_VEC: +#ifdef CONFIG_ALTIVEC + memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); + t->vscr = vcpu->arch.vscr; + t->vrsave = -1; + kvmppc_load_up_altivec(); +#endif + break; + case MSR_VSX: +#ifdef CONFIG_VSX + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) + thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; + kvmppc_load_up_vsx(); +#endif + break; + default: + BUG(); + } + + vcpu->arch.guest_owned_ext |= msr; + + kvmppc_recalc_shadow_msr(vcpu); + + return RESUME_GUEST; +} + int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { @@ -563,7 +708,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_INST_STORAGE: vcpu->stat.pf_instruc++; /* only care about PTEG not found errors, but leave NX alone */ - if (vcpu->arch.shadow_msr & 0x40000000) { + if (vcpu->arch.shadow_srr1 & 0x40000000) { r = kvmppc_handle_pagefault(run, vcpu, vcpu->arch.pc, exit_nr); vcpu->stat.sp_instruc++; } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && @@ -575,7 +720,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, */ kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL); } else { - vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x58000000); + vcpu->arch.msr |= vcpu->arch.shadow_srr1 & 0x58000000; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL); r = RESUME_GUEST; @@ -621,6 +766,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_PROGRAM: { enum emulation_result er; + ulong flags; + + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; if (vcpu->arch.msr & MSR_PR) { #ifdef EXIT_DEBUG @@ -628,7 +776,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, #endif if ((vcpu->arch.last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) { - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + kvmppc_core_queue_program(vcpu, flags); r = RESUME_GUEST; break; } @@ -638,12 +786,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, er = kvmppc_emulate_instruction(run, vcpu); switch (er) { case EMULATE_DONE: - r = RESUME_GUEST; + r = RESUME_GUEST_NV; break; case EMULATE_FAIL: printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", __func__, vcpu->arch.pc, vcpu->arch.last_inst); - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + kvmppc_core_queue_program(vcpu, flags); r = RESUME_GUEST; break; default: @@ -653,23 +801,30 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } case BOOK3S_INTERRUPT_SYSCALL: #ifdef EXIT_DEBUG - printk(KERN_INFO "Syscall Nr %d\n", (int)vcpu->arch.gpr[0]); + printk(KERN_INFO "Syscall Nr %d\n", (int)kvmppc_get_gpr(vcpu, 0)); #endif vcpu->stat.syscall_exits++; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; break; - case BOOK3S_INTERRUPT_MACHINE_CHECK: case BOOK3S_INTERRUPT_FP_UNAVAIL: - case BOOK3S_INTERRUPT_TRACE: + r = kvmppc_handle_ext(vcpu, exit_nr, MSR_FP); + break; case BOOK3S_INTERRUPT_ALTIVEC: + r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VEC); + break; case BOOK3S_INTERRUPT_VSX: + r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VSX); + break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + case BOOK3S_INTERRUPT_TRACE: kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; break; default: /* Ugh - bork here! What did we get? */ - printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", exit_nr, vcpu->arch.pc, vcpu->arch.shadow_msr); + printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", + exit_nr, vcpu->arch.pc, vcpu->arch.shadow_srr1); r = RESUME_HOST; BUG(); break; @@ -712,10 +867,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; regs->pc = vcpu->arch.pc; - regs->cr = vcpu->arch.cr; + regs->cr = kvmppc_get_cr(vcpu); regs->ctr = vcpu->arch.ctr; regs->lr = vcpu->arch.lr; - regs->xer = vcpu->arch.xer; + regs->xer = kvmppc_get_xer(vcpu); regs->msr = vcpu->arch.msr; regs->srr0 = vcpu->arch.srr0; regs->srr1 = vcpu->arch.srr1; @@ -729,7 +884,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg7 = vcpu->arch.sprg6; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) - regs->gpr[i] = vcpu->arch.gpr[i]; + regs->gpr[i] = kvmppc_get_gpr(vcpu, i); return 0; } @@ -739,10 +894,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; vcpu->arch.pc = regs->pc; - vcpu->arch.cr = regs->cr; + kvmppc_set_cr(vcpu, regs->cr); vcpu->arch.ctr = regs->ctr; vcpu->arch.lr = regs->lr; - vcpu->arch.xer = regs->xer; + kvmppc_set_xer(vcpu, regs->xer); kvmppc_set_msr(vcpu, regs->msr); vcpu->arch.srr0 = regs->srr0; vcpu->arch.srr1 = regs->srr1; @@ -754,8 +909,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.sprg6 = regs->sprg5; vcpu->arch.sprg7 = regs->sprg6; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++) - vcpu->arch.gpr[i] = regs->gpr[i]; + for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) + kvmppc_set_gpr(vcpu, i, regs->gpr[i]); return 0; } @@ -850,7 +1005,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, int is_dirty = 0; int r, n; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = kvm_get_dirty_log(kvm, log, &is_dirty); if (r) @@ -858,7 +1013,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; ga = memslot->base_gfn << PAGE_SHIFT; ga_end = ga + (memslot->npages << PAGE_SHIFT); @@ -872,7 +1027,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, r = 0; out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -910,6 +1065,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem; vcpu->arch.trampoline_enter = kvmppc_trampoline_enter; vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem; + vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall; vcpu->arch.shadow_msr = MSR_USER64; @@ -943,6 +1099,10 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; + struct thread_struct ext_bkp; + bool save_vec = current->thread.used_vr; + bool save_vsx = current->thread.used_vsr; + ulong ext_msr; /* No need to go into the guest when all we do is going out */ if (signal_pending(current)) { @@ -950,6 +1110,35 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return -EINTR; } + /* Save FPU state in stack */ + if (current->thread.regs->msr & MSR_FP) + giveup_fpu(current); + memcpy(ext_bkp.fpr, current->thread.fpr, sizeof(current->thread.fpr)); + ext_bkp.fpscr = current->thread.fpscr; + ext_bkp.fpexc_mode = current->thread.fpexc_mode; + +#ifdef CONFIG_ALTIVEC + /* Save Altivec state in stack */ + if (save_vec) { + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); + memcpy(ext_bkp.vr, current->thread.vr, sizeof(ext_bkp.vr)); + ext_bkp.vscr = current->thread.vscr; + ext_bkp.vrsave = current->thread.vrsave; + } + ext_bkp.used_vr = current->thread.used_vr; +#endif + +#ifdef CONFIG_VSX + /* Save VSX state in stack */ + if (save_vsx && (current->thread.regs->msr & MSR_VSX)) + __giveup_vsx(current); + ext_bkp.used_vsr = current->thread.used_vsr; +#endif + + /* Remember the MSR with disabled extensions */ + ext_msr = current->thread.regs->msr; + /* XXX we get called with irq disabled - change that! */ local_irq_enable(); @@ -957,6 +1146,32 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) local_irq_disable(); + current->thread.regs->msr = ext_msr; + + /* Make sure we save the guest FPU/Altivec/VSX state */ + kvmppc_giveup_ext(vcpu, MSR_FP); + kvmppc_giveup_ext(vcpu, MSR_VEC); + kvmppc_giveup_ext(vcpu, MSR_VSX); + + /* Restore FPU state from stack */ + memcpy(current->thread.fpr, ext_bkp.fpr, sizeof(ext_bkp.fpr)); + current->thread.fpscr = ext_bkp.fpscr; + current->thread.fpexc_mode = ext_bkp.fpexc_mode; + +#ifdef CONFIG_ALTIVEC + /* Restore Altivec state from stack */ + if (save_vec && current->thread.used_vr) { + memcpy(current->thread.vr, ext_bkp.vr, sizeof(ext_bkp.vr)); + current->thread.vscr = ext_bkp.vscr; + current->thread.vrsave= ext_bkp.vrsave; + } + current->thread.used_vr = ext_bkp.used_vr; +#endif + +#ifdef CONFIG_VSX + current->thread.used_vsr = ext_bkp.used_vsr; +#endif + return ret; } diff --git a/arch/powerpc/kvm/book3s_64_emulate.c b/arch/powerpc/kvm/book3s_64_emulate.c index 1027eac6d47..2b0ee7e040c 100644 --- a/arch/powerpc/kvm/book3s_64_emulate.c +++ b/arch/powerpc/kvm/book3s_64_emulate.c @@ -65,11 +65,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case 31: switch (get_xop(inst)) { case OP_31_XOP_MFMSR: - vcpu->arch.gpr[get_rt(inst)] = vcpu->arch.msr; + kvmppc_set_gpr(vcpu, get_rt(inst), vcpu->arch.msr); break; case OP_31_XOP_MTMSRD: { - ulong rs = vcpu->arch.gpr[get_rs(inst)]; + ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst)); if (inst & 0x10000) { vcpu->arch.msr &= ~(MSR_RI | MSR_EE); vcpu->arch.msr |= rs & (MSR_RI | MSR_EE); @@ -78,30 +78,30 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } case OP_31_XOP_MTMSR: - kvmppc_set_msr(vcpu, vcpu->arch.gpr[get_rs(inst)]); + kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, get_rs(inst))); break; case OP_31_XOP_MFSRIN: { int srnum; - srnum = (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf; + srnum = (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf; if (vcpu->arch.mmu.mfsrin) { u32 sr; sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); - vcpu->arch.gpr[get_rt(inst)] = sr; + kvmppc_set_gpr(vcpu, get_rt(inst), sr); } break; } case OP_31_XOP_MTSRIN: vcpu->arch.mmu.mtsrin(vcpu, - (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf, - vcpu->arch.gpr[get_rs(inst)]); + (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf, + kvmppc_get_gpr(vcpu, get_rs(inst))); break; case OP_31_XOP_TLBIE: case OP_31_XOP_TLBIEL: { bool large = (inst & 0x00200000) ? true : false; - ulong addr = vcpu->arch.gpr[get_rb(inst)]; + ulong addr = kvmppc_get_gpr(vcpu, get_rb(inst)); vcpu->arch.mmu.tlbie(vcpu, addr, large); break; } @@ -111,14 +111,16 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!vcpu->arch.mmu.slbmte) return EMULATE_FAIL; - vcpu->arch.mmu.slbmte(vcpu, vcpu->arch.gpr[get_rs(inst)], - vcpu->arch.gpr[get_rb(inst)]); + vcpu->arch.mmu.slbmte(vcpu, + kvmppc_get_gpr(vcpu, get_rs(inst)), + kvmppc_get_gpr(vcpu, get_rb(inst))); break; case OP_31_XOP_SLBIE: if (!vcpu->arch.mmu.slbie) return EMULATE_FAIL; - vcpu->arch.mmu.slbie(vcpu, vcpu->arch.gpr[get_rb(inst)]); + vcpu->arch.mmu.slbie(vcpu, + kvmppc_get_gpr(vcpu, get_rb(inst))); break; case OP_31_XOP_SLBIA: if (!vcpu->arch.mmu.slbia) @@ -132,9 +134,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, } else { ulong t, rb; - rb = vcpu->arch.gpr[get_rb(inst)]; + rb = kvmppc_get_gpr(vcpu, get_rb(inst)); t = vcpu->arch.mmu.slbmfee(vcpu, rb); - vcpu->arch.gpr[get_rt(inst)] = t; + kvmppc_set_gpr(vcpu, get_rt(inst), t); } break; case OP_31_XOP_SLBMFEV: @@ -143,20 +145,20 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, } else { ulong t, rb; - rb = vcpu->arch.gpr[get_rb(inst)]; + rb = kvmppc_get_gpr(vcpu, get_rb(inst)); t = vcpu->arch.mmu.slbmfev(vcpu, rb); - vcpu->arch.gpr[get_rt(inst)] = t; + kvmppc_set_gpr(vcpu, get_rt(inst), t); } break; case OP_31_XOP_DCBZ: { - ulong rb = vcpu->arch.gpr[get_rb(inst)]; + ulong rb = kvmppc_get_gpr(vcpu, get_rb(inst)); ulong ra = 0; ulong addr; u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; if (get_ra(inst)) - ra = vcpu->arch.gpr[get_ra(inst)]; + ra = kvmppc_get_gpr(vcpu, get_ra(inst)); addr = (ra + rb) & ~31ULL; if (!(vcpu->arch.msr & MSR_SF)) @@ -233,43 +235,44 @@ static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val) int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) { int emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_SDR1: - to_book3s(vcpu)->sdr1 = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->sdr1 = spr_val; break; case SPRN_DSISR: - to_book3s(vcpu)->dsisr = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->dsisr = spr_val; break; case SPRN_DAR: - vcpu->arch.dear = vcpu->arch.gpr[rs]; + vcpu->arch.dear = spr_val; break; case SPRN_HIOR: - to_book3s(vcpu)->hior = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hior = spr_val; break; case SPRN_IBAT0U ... SPRN_IBAT3L: case SPRN_IBAT4U ... SPRN_IBAT7L: case SPRN_DBAT0U ... SPRN_DBAT3L: case SPRN_DBAT4U ... SPRN_DBAT7L: - kvmppc_write_bat(vcpu, sprn, (u32)vcpu->arch.gpr[rs]); + kvmppc_write_bat(vcpu, sprn, (u32)spr_val); /* BAT writes happen so rarely that we're ok to flush * everything here */ kvmppc_mmu_pte_flush(vcpu, 0, 0); break; case SPRN_HID0: - to_book3s(vcpu)->hid[0] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[0] = spr_val; break; case SPRN_HID1: - to_book3s(vcpu)->hid[1] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[1] = spr_val; break; case SPRN_HID2: - to_book3s(vcpu)->hid[2] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[2] = spr_val; break; case SPRN_HID4: - to_book3s(vcpu)->hid[4] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[4] = spr_val; break; case SPRN_HID5: - to_book3s(vcpu)->hid[5] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[5] = spr_val; /* guest HID5 set can change is_dcbz32 */ if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV)) @@ -299,38 +302,38 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_SDR1: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->sdr1; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); break; case SPRN_DSISR: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->dsisr; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->dsisr); break; case SPRN_DAR: - vcpu->arch.gpr[rt] = vcpu->arch.dear; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break; case SPRN_HIOR: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hior; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior); break; case SPRN_HID0: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[0]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[0]); break; case SPRN_HID1: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[1]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[1]); break; case SPRN_HID2: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[2]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[2]); break; case SPRN_HID4: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[4]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[4]); break; case SPRN_HID5: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[5]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]); break; case SPRN_THRM1: case SPRN_THRM2: case SPRN_THRM3: case SPRN_CTRLF: case SPRN_CTRLT: - vcpu->arch.gpr[rt] = 0; + kvmppc_set_gpr(vcpu, rt, 0); break; default: printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); diff --git a/arch/powerpc/kvm/book3s_64_exports.c b/arch/powerpc/kvm/book3s_64_exports.c index 5b2db38ed86..1dd5a1ddfd0 100644 --- a/arch/powerpc/kvm/book3s_64_exports.c +++ b/arch/powerpc/kvm/book3s_64_exports.c @@ -22,3 +22,11 @@ EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter); EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem); +EXPORT_SYMBOL_GPL(kvmppc_rmcall); +EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); +#ifdef CONFIG_ALTIVEC +EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); +#endif +#ifdef CONFIG_VSX +EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx); +#endif diff --git a/arch/powerpc/kvm/book3s_64_interrupts.S b/arch/powerpc/kvm/book3s_64_interrupts.S index 7b55d8094c8..c1584d0cbce 100644 --- a/arch/powerpc/kvm/book3s_64_interrupts.S +++ b/arch/powerpc/kvm/book3s_64_interrupts.S @@ -28,11 +28,6 @@ #define ULONG_SIZE 8 #define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE)) -.macro mfpaca tmp_reg, src_reg, offset, vcpu_reg - ld \tmp_reg, (PACA_EXMC+\offset)(r13) - std \tmp_reg, VCPU_GPR(\src_reg)(\vcpu_reg) -.endm - .macro DISABLE_INTERRUPTS mfmsr r0 rldicl r0,r0,48,1 @@ -40,6 +35,26 @@ mtmsrd r0,1 .endm +#define VCPU_LOAD_NVGPRS(vcpu) \ + ld r14, VCPU_GPR(r14)(vcpu); \ + ld r15, VCPU_GPR(r15)(vcpu); \ + ld r16, VCPU_GPR(r16)(vcpu); \ + ld r17, VCPU_GPR(r17)(vcpu); \ + ld r18, VCPU_GPR(r18)(vcpu); \ + ld r19, VCPU_GPR(r19)(vcpu); \ + ld r20, VCPU_GPR(r20)(vcpu); \ + ld r21, VCPU_GPR(r21)(vcpu); \ + ld r22, VCPU_GPR(r22)(vcpu); \ + ld r23, VCPU_GPR(r23)(vcpu); \ + ld r24, VCPU_GPR(r24)(vcpu); \ + ld r25, VCPU_GPR(r25)(vcpu); \ + ld r26, VCPU_GPR(r26)(vcpu); \ + ld r27, VCPU_GPR(r27)(vcpu); \ + ld r28, VCPU_GPR(r28)(vcpu); \ + ld r29, VCPU_GPR(r29)(vcpu); \ + ld r30, VCPU_GPR(r30)(vcpu); \ + ld r31, VCPU_GPR(r31)(vcpu); \ + /***************************************************************************** * * * Guest entry / exit code that is in kernel module memory (highmem) * @@ -67,61 +82,32 @@ kvm_start_entry: SAVE_NVGPRS(r1) /* Save LR */ - mflr r14 - std r14, _LINK(r1) - -/* XXX optimize non-volatile loading away */ -kvm_start_lightweight: + std r0, _LINK(r1) - DISABLE_INTERRUPTS + /* Load non-volatile guest state from the vcpu */ + VCPU_LOAD_NVGPRS(r4) /* Save R1/R2 in the PACA */ - std r1, PACAR1(r13) - std r2, (PACA_EXMC+EX_SRR0)(r13) + std r1, PACA_KVM_HOST_R1(r13) + std r2, PACA_KVM_HOST_R2(r13) + + /* XXX swap in/out on load? */ ld r3, VCPU_HIGHMEM_HANDLER(r4) - std r3, PACASAVEDMSR(r13) + std r3, PACA_KVM_VMHANDLER(r13) - /* Load non-volatile guest state from the vcpu */ - ld r14, VCPU_GPR(r14)(r4) - ld r15, VCPU_GPR(r15)(r4) - ld r16, VCPU_GPR(r16)(r4) - ld r17, VCPU_GPR(r17)(r4) - ld r18, VCPU_GPR(r18)(r4) - ld r19, VCPU_GPR(r19)(r4) - ld r20, VCPU_GPR(r20)(r4) - ld r21, VCPU_GPR(r21)(r4) - ld r22, VCPU_GPR(r22)(r4) - ld r23, VCPU_GPR(r23)(r4) - ld r24, VCPU_GPR(r24)(r4) - ld r25, VCPU_GPR(r25)(r4) - ld r26, VCPU_GPR(r26)(r4) - ld r27, VCPU_GPR(r27)(r4) - ld r28, VCPU_GPR(r28)(r4) - ld r29, VCPU_GPR(r29)(r4) - ld r30, VCPU_GPR(r30)(r4) - ld r31, VCPU_GPR(r31)(r4) +kvm_start_lightweight: ld r9, VCPU_PC(r4) /* r9 = vcpu->arch.pc */ ld r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */ - ld r3, VCPU_TRAMPOLINE_ENTER(r4) - mtsrr0 r3 - - LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)) - mtsrr1 r3 - - /* Load guest state in the respective registers */ - lwz r3, VCPU_CR(r4) /* r3 = vcpu->arch.cr */ - stw r3, (PACA_EXMC + EX_CCR)(r13) - - ld r3, VCPU_CTR(r4) /* r3 = vcpu->arch.ctr */ - mtctr r3 /* CTR = r3 */ + /* Load some guest state in the respective registers */ + ld r5, VCPU_CTR(r4) /* r5 = vcpu->arch.ctr */ + /* will be swapped in by rmcall */ ld r3, VCPU_LR(r4) /* r3 = vcpu->arch.lr */ mtlr r3 /* LR = r3 */ - ld r3, VCPU_XER(r4) /* r3 = vcpu->arch.xer */ - std r3, (PACA_EXMC + EX_R3)(r13) + DISABLE_INTERRUPTS /* Some guests may need to have dcbz set to 32 byte length. * @@ -141,36 +127,15 @@ kvm_start_lightweight: mtspr SPRN_HID5,r3 no_dcbz32_on: - /* Load guest GPRs */ - - ld r3, VCPU_GPR(r9)(r4) - std r3, (PACA_EXMC + EX_R9)(r13) - ld r3, VCPU_GPR(r10)(r4) - std r3, (PACA_EXMC + EX_R10)(r13) - ld r3, VCPU_GPR(r11)(r4) - std r3, (PACA_EXMC + EX_R11)(r13) - ld r3, VCPU_GPR(r12)(r4) - std r3, (PACA_EXMC + EX_R12)(r13) - ld r3, VCPU_GPR(r13)(r4) - std r3, (PACA_EXMC + EX_R13)(r13) - - ld r0, VCPU_GPR(r0)(r4) - ld r1, VCPU_GPR(r1)(r4) - ld r2, VCPU_GPR(r2)(r4) - ld r3, VCPU_GPR(r3)(r4) - ld r5, VCPU_GPR(r5)(r4) - ld r6, VCPU_GPR(r6)(r4) - ld r7, VCPU_GPR(r7)(r4) - ld r8, VCPU_GPR(r8)(r4) - ld r4, VCPU_GPR(r4)(r4) - - /* This sets the Magic value for the trampoline */ - - li r11, 1 - stb r11, PACA_KVM_IN_GUEST(r13) + + ld r6, VCPU_RMCALL(r4) + mtctr r6 + + ld r3, VCPU_TRAMPOLINE_ENTER(r4) + LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR)) /* Jump to SLB patching handlder and into our guest */ - RFI + bctr /* * This is the handler in module memory. It gets jumped at from the @@ -184,125 +149,70 @@ kvmppc_handler_highmem: /* * Register usage at this point: * - * R00 = guest R13 - * R01 = host R1 - * R02 = host R2 - * R10 = guest PC - * R11 = guest MSR - * R12 = exit handler id - * R13 = PACA - * PACA.exmc.R9 = guest R1 - * PACA.exmc.R10 = guest R10 - * PACA.exmc.R11 = guest R11 - * PACA.exmc.R12 = guest R12 - * PACA.exmc.R13 = guest R2 - * PACA.exmc.DAR = guest DAR - * PACA.exmc.DSISR = guest DSISR - * PACA.exmc.LR = guest instruction - * PACA.exmc.CCR = guest CR - * PACA.exmc.SRR0 = guest R0 + * R0 = guest last inst + * R1 = host R1 + * R2 = host R2 + * R3 = guest PC + * R4 = guest MSR + * R5 = guest DAR + * R6 = guest DSISR + * R13 = PACA + * PACA.KVM.* = guest * * */ - std r3, (PACA_EXMC+EX_R3)(r13) + /* R7 = vcpu */ + ld r7, GPR4(r1) - /* save the exit id in R3 */ - mr r3, r12 + /* Now save the guest state */ - /* R12 = vcpu */ - ld r12, GPR4(r1) + stw r0, VCPU_LAST_INST(r7) - /* Now save the guest state */ + std r3, VCPU_PC(r7) + std r4, VCPU_SHADOW_SRR1(r7) + std r5, VCPU_FAULT_DEAR(r7) + std r6, VCPU_FAULT_DSISR(r7) - std r0, VCPU_GPR(r13)(r12) - std r4, VCPU_GPR(r4)(r12) - std r5, VCPU_GPR(r5)(r12) - std r6, VCPU_GPR(r6)(r12) - std r7, VCPU_GPR(r7)(r12) - std r8, VCPU_GPR(r8)(r12) - std r9, VCPU_GPR(r9)(r12) - - /* get registers from PACA */ - mfpaca r5, r0, EX_SRR0, r12 - mfpaca r5, r3, EX_R3, r12 - mfpaca r5, r1, EX_R9, r12 - mfpaca r5, r10, EX_R10, r12 - mfpaca r5, r11, EX_R11, r12 - mfpaca r5, r12, EX_R12, r12 - mfpaca r5, r2, EX_R13, r12 - - lwz r5, (PACA_EXMC+EX_LR)(r13) - stw r5, VCPU_LAST_INST(r12) - - lwz r5, (PACA_EXMC+EX_CCR)(r13) - stw r5, VCPU_CR(r12) - - ld r5, VCPU_HFLAGS(r12) + ld r5, VCPU_HFLAGS(r7) rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */ beq no_dcbz32_off + li r4, 0 mfspr r5,SPRN_HID5 - rldimi r5,r5,6,56 + rldimi r5,r4,6,56 mtspr SPRN_HID5,r5 no_dcbz32_off: - /* XXX maybe skip on lightweight? */ - std r14, VCPU_GPR(r14)(r12) - std r15, VCPU_GPR(r15)(r12) - std r16, VCPU_GPR(r16)(r12) - std r17, VCPU_GPR(r17)(r12) - std r18, VCPU_GPR(r18)(r12) - std r19, VCPU_GPR(r19)(r12) - std r20, VCPU_GPR(r20)(r12) - std r21, VCPU_GPR(r21)(r12) - std r22, VCPU_GPR(r22)(r12) - std r23, VCPU_GPR(r23)(r12) - std r24, VCPU_GPR(r24)(r12) - std r25, VCPU_GPR(r25)(r12) - std r26, VCPU_GPR(r26)(r12) - std r27, VCPU_GPR(r27)(r12) - std r28, VCPU_GPR(r28)(r12) - std r29, VCPU_GPR(r29)(r12) - std r30, VCPU_GPR(r30)(r12) - std r31, VCPU_GPR(r31)(r12) - - /* Restore non-volatile host registers (r14 - r31) */ - REST_NVGPRS(r1) - - /* Save guest PC (R10) */ - std r10, VCPU_PC(r12) - - /* Save guest msr (R11) */ - std r11, VCPU_SHADOW_MSR(r12) - - /* Save guest CTR (in R12) */ + std r14, VCPU_GPR(r14)(r7) + std r15, VCPU_GPR(r15)(r7) + std r16, VCPU_GPR(r16)(r7) + std r17, VCPU_GPR(r17)(r7) + std r18, VCPU_GPR(r18)(r7) + std r19, VCPU_GPR(r19)(r7) + std r20, VCPU_GPR(r20)(r7) + std r21, VCPU_GPR(r21)(r7) + std r22, VCPU_GPR(r22)(r7) + std r23, VCPU_GPR(r23)(r7) + std r24, VCPU_GPR(r24)(r7) + std r25, VCPU_GPR(r25)(r7) + std r26, VCPU_GPR(r26)(r7) + std r27, VCPU_GPR(r27)(r7) + std r28, VCPU_GPR(r28)(r7) + std r29, VCPU_GPR(r29)(r7) + std r30, VCPU_GPR(r30)(r7) + std r31, VCPU_GPR(r31)(r7) + + /* Save guest CTR */ mfctr r5 - std r5, VCPU_CTR(r12) + std r5, VCPU_CTR(r7) /* Save guest LR */ mflr r5 - std r5, VCPU_LR(r12) - - /* Save guest XER */ - mfxer r5 - std r5, VCPU_XER(r12) - - /* Save guest DAR */ - ld r5, (PACA_EXMC+EX_DAR)(r13) - std r5, VCPU_FAULT_DEAR(r12) - - /* Save guest DSISR */ - lwz r5, (PACA_EXMC+EX_DSISR)(r13) - std r5, VCPU_FAULT_DSISR(r12) + std r5, VCPU_LR(r7) /* Restore host msr -> SRR1 */ - ld r7, VCPU_HOST_MSR(r12) - mtsrr1 r7 - - /* Restore host IP -> SRR0 */ - ld r6, VCPU_HOST_RETIP(r12) - mtsrr0 r6 + ld r6, VCPU_HOST_MSR(r7) /* * For some interrupts, we need to call the real Linux @@ -314,13 +224,14 @@ no_dcbz32_off: * r3 = address of interrupt handler (exit reason) */ - cmpwi r3, BOOK3S_INTERRUPT_EXTERNAL + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL beq call_linux_handler - cmpwi r3, BOOK3S_INTERRUPT_DECREMENTER + cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER beq call_linux_handler - /* Back to Interruptable Mode! (goto kvm_return_point) */ - RFI + /* Back to EE=1 */ + mtmsr r6 + b kvm_return_point call_linux_handler: @@ -333,16 +244,22 @@ call_linux_handler: * interrupt handler! * * R3 still contains the exit code, - * R6 VCPU_HOST_RETIP and - * R7 VCPU_HOST_MSR + * R5 VCPU_HOST_RETIP and + * R6 VCPU_HOST_MSR */ - mtlr r3 + /* Restore host IP -> SRR0 */ + ld r5, VCPU_HOST_RETIP(r7) + + /* XXX Better move to a safe function? + * What if we get an HTAB flush in between mtsrr0 and mtsrr1? */ - ld r5, VCPU_TRAMPOLINE_LOWMEM(r12) - mtsrr0 r5 - LOAD_REG_IMMEDIATE(r5, MSR_KERNEL & ~(MSR_IR | MSR_DR)) - mtsrr1 r5 + mtlr r12 + + ld r4, VCPU_TRAMPOLINE_LOWMEM(r7) + mtsrr0 r4 + LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)) + mtsrr1 r3 RFI @@ -351,42 +268,51 @@ kvm_return_point: /* Jump back to lightweight entry if we're supposed to */ /* go back into the guest */ - mr r5, r3 + + /* Pass the exit number as 3rd argument to kvmppc_handle_exit */ + mr r5, r12 + /* Restore r3 (kvm_run) and r4 (vcpu) */ REST_2GPRS(3, r1) bl KVMPPC_HANDLE_EXIT -#if 0 /* XXX get lightweight exits back */ + /* If RESUME_GUEST, get back in the loop */ cmpwi r3, RESUME_GUEST - bne kvm_exit_heavyweight + beq kvm_loop_lightweight - /* put VCPU and KVM_RUN back into place and roll again! */ - REST_2GPRS(3, r1) - b kvm_start_lightweight + cmpwi r3, RESUME_GUEST_NV + beq kvm_loop_heavyweight -kvm_exit_heavyweight: - /* Restore non-volatile host registers */ - ld r14, _LINK(r1) - mtlr r14 - REST_NVGPRS(r1) +kvm_exit_loop: - addi r1, r1, SWITCH_FRAME_SIZE -#else ld r4, _LINK(r1) mtlr r4 - cmpwi r3, RESUME_GUEST - bne kvm_exit_heavyweight + /* Restore non-volatile host registers (r14 - r31) */ + REST_NVGPRS(r1) + + addi r1, r1, SWITCH_FRAME_SIZE + blr + +kvm_loop_heavyweight: + + ld r4, _LINK(r1) + std r4, (16 + SWITCH_FRAME_SIZE)(r1) + /* Load vcpu and cpu_run */ REST_2GPRS(3, r1) - addi r1, r1, SWITCH_FRAME_SIZE + /* Load non-volatile guest state from the vcpu */ + VCPU_LOAD_NVGPRS(r4) - b kvm_start_entry + /* Jump back into the beginning of this function */ + b kvm_start_lightweight -kvm_exit_heavyweight: +kvm_loop_lightweight: - addi r1, r1, SWITCH_FRAME_SIZE -#endif + /* We'll need the vcpu pointer */ + REST_GPR(4, r1) + + /* Jump back into the beginning of this function */ + b kvm_start_lightweight - blr diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index e4beeb371a7..512dcff7755 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -54,7 +54,7 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( if (!vcpu_book3s->slb[i].valid) continue; - if (vcpu_book3s->slb[i].large) + if (vcpu_book3s->slb[i].tb) cmp_esid = esid_1t; if (vcpu_book3s->slb[i].esid == cmp_esid) @@ -65,9 +65,10 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( eaddr, esid, esid_1t); for (i = 0; i < vcpu_book3s->slb_nr; i++) { if (vcpu_book3s->slb[i].vsid) - dprintk(" %d: %c%c %llx %llx\n", i, + dprintk(" %d: %c%c%c %llx %llx\n", i, vcpu_book3s->slb[i].valid ? 'v' : ' ', vcpu_book3s->slb[i].large ? 'l' : ' ', + vcpu_book3s->slb[i].tb ? 't' : ' ', vcpu_book3s->slb[i].esid, vcpu_book3s->slb[i].vsid); } @@ -84,7 +85,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, if (!slb) return 0; - if (slb->large) + if (slb->tb) return (((u64)eaddr >> 12) & 0xfffffff) | (((u64)slb->vsid) << 28); @@ -309,7 +310,8 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) slbe = &vcpu_book3s->slb[slb_nr]; slbe->large = (rs & SLB_VSID_L) ? 1 : 0; - slbe->esid = slbe->large ? esid_1t : esid; + slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0; + slbe->esid = slbe->tb ? esid_1t : esid; slbe->vsid = rs >> 12; slbe->valid = (rb & SLB_ESID_V) ? 1 : 0; slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0; diff --git a/arch/powerpc/kvm/book3s_64_rmhandlers.S b/arch/powerpc/kvm/book3s_64_rmhandlers.S index fb7dd2e9ac8..c83c60ad96c 100644 --- a/arch/powerpc/kvm/book3s_64_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_64_rmhandlers.S @@ -45,36 +45,25 @@ kvmppc_trampoline_\intno: * To distinguish, we check a magic byte in the PACA */ mfspr r13, SPRN_SPRG_PACA /* r13 = PACA */ - std r12, (PACA_EXMC + EX_R12)(r13) + std r12, PACA_KVM_SCRATCH0(r13) mfcr r12 - stw r12, (PACA_EXMC + EX_CCR)(r13) + stw r12, PACA_KVM_SCRATCH1(r13) lbz r12, PACA_KVM_IN_GUEST(r13) - cmpwi r12, 0 + cmpwi r12, KVM_GUEST_MODE_NONE bne ..kvmppc_handler_hasmagic_\intno /* No KVM guest? Then jump back to the Linux handler! */ - lwz r12, (PACA_EXMC + EX_CCR)(r13) + lwz r12, PACA_KVM_SCRATCH1(r13) mtcr r12 - ld r12, (PACA_EXMC + EX_R12)(r13) + ld r12, PACA_KVM_SCRATCH0(r13) mfspr r13, SPRN_SPRG_SCRATCH0 /* r13 = original r13 */ b kvmppc_resume_\intno /* Get back original handler */ /* Now we know we're handling a KVM guest */ ..kvmppc_handler_hasmagic_\intno: - /* Unset guest state */ - li r12, 0 - stb r12, PACA_KVM_IN_GUEST(r13) - std r1, (PACA_EXMC+EX_R9)(r13) - std r10, (PACA_EXMC+EX_R10)(r13) - std r11, (PACA_EXMC+EX_R11)(r13) - std r2, (PACA_EXMC+EX_R13)(r13) - - mfsrr0 r10 - mfsrr1 r11 - - /* Restore R1/R2 so we can handle faults */ - ld r1, PACAR1(r13) - ld r2, (PACA_EXMC+EX_SRR0)(r13) + /* Should we just skip the faulting instruction? */ + cmpwi r12, KVM_GUEST_MODE_SKIP + beq kvmppc_handler_skip_ins /* Let's store which interrupt we're handling */ li r12, \intno @@ -102,23 +91,107 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX /* + * Bring us back to the faulting code, but skip the + * faulting instruction. + * + * This is a generic exit path from the interrupt + * trampolines above. + * + * Input Registers: + * + * R12 = free + * R13 = PACA + * PACA.KVM.SCRATCH0 = guest R12 + * PACA.KVM.SCRATCH1 = guest CR + * SPRG_SCRATCH0 = guest R13 + * + */ +kvmppc_handler_skip_ins: + + /* Patch the IP to the next instruction */ + mfsrr0 r12 + addi r12, r12, 4 + mtsrr0 r12 + + /* Clean up all state */ + lwz r12, PACA_KVM_SCRATCH1(r13) + mtcr r12 + ld r12, PACA_KVM_SCRATCH0(r13) + mfspr r13, SPRN_SPRG_SCRATCH0 + + /* And get back into the code */ + RFI + +/* * This trampoline brings us back to a real mode handler * * Input Registers: * - * R6 = SRR0 - * R7 = SRR1 + * R5 = SRR0 + * R6 = SRR1 * LR = real-mode IP * */ .global kvmppc_handler_lowmem_trampoline kvmppc_handler_lowmem_trampoline: - mtsrr0 r6 - mtsrr1 r7 + mtsrr0 r5 + mtsrr1 r6 blr kvmppc_handler_lowmem_trampoline_end: +/* + * Call a function in real mode + * + * Input Registers: + * + * R3 = function + * R4 = MSR + * R5 = CTR + * + */ +_GLOBAL(kvmppc_rmcall) + mtmsr r4 /* Disable relocation, so mtsrr + doesn't get interrupted */ + mtctr r5 + mtsrr0 r3 + mtsrr1 r4 + RFI + +/* + * Activate current's external feature (FPU/Altivec/VSX) + */ +#define define_load_up(what) \ + \ +_GLOBAL(kvmppc_load_up_ ## what); \ + subi r1, r1, INT_FRAME_SIZE; \ + mflr r3; \ + std r3, _LINK(r1); \ + mfmsr r4; \ + std r31, GPR3(r1); \ + mr r31, r4; \ + li r5, MSR_DR; \ + oris r5, r5, MSR_EE@h; \ + andc r4, r4, r5; \ + mtmsr r4; \ + \ + bl .load_up_ ## what; \ + \ + mtmsr r31; \ + ld r3, _LINK(r1); \ + ld r31, GPR3(r1); \ + addi r1, r1, INT_FRAME_SIZE; \ + mtlr r3; \ + blr + +define_load_up(fpu) +#ifdef CONFIG_ALTIVEC +define_load_up(altivec) +#endif +#ifdef CONFIG_VSX +define_load_up(vsx) +#endif + .global kvmppc_trampoline_lowmem kvmppc_trampoline_lowmem: .long kvmppc_handler_lowmem_trampoline - _stext diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index ecd237a03fd..35b76272218 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -31,7 +31,7 @@ #define REBOLT_SLB_ENTRY(num) \ ld r10, SHADOW_SLB_ESID(num)(r11); \ cmpdi r10, 0; \ - beq slb_exit_skip_1; \ + beq slb_exit_skip_ ## num; \ oris r10, r10, SLB_ESID_V@h; \ ld r9, SHADOW_SLB_VSID(num)(r11); \ slbmte r9, r10; \ @@ -51,23 +51,21 @@ kvmppc_handler_trampoline_enter: * * MSR = ~IR|DR * R13 = PACA + * R1 = host R1 + * R2 = host R2 * R9 = guest IP * R10 = guest MSR - * R11 = free - * R12 = free - * PACA[PACA_EXMC + EX_R9] = guest R9 - * PACA[PACA_EXMC + EX_R10] = guest R10 - * PACA[PACA_EXMC + EX_R11] = guest R11 - * PACA[PACA_EXMC + EX_R12] = guest R12 - * PACA[PACA_EXMC + EX_R13] = guest R13 - * PACA[PACA_EXMC + EX_CCR] = guest CR - * PACA[PACA_EXMC + EX_R3] = guest XER + * all other GPRS = free + * PACA[KVM_CR] = guest CR + * PACA[KVM_XER] = guest XER */ mtsrr0 r9 mtsrr1 r10 - mtspr SPRN_SPRG_SCRATCH0, r0 + /* Activate guest mode, so faults get handled by KVM */ + li r11, KVM_GUEST_MODE_GUEST + stb r11, PACA_KVM_IN_GUEST(r13) /* Remove LPAR shadow entries */ @@ -131,20 +129,27 @@ slb_do_enter: /* Enter guest */ - mfspr r0, SPRN_SPRG_SCRATCH0 - - ld r9, (PACA_EXMC+EX_R9)(r13) - ld r10, (PACA_EXMC+EX_R10)(r13) - ld r12, (PACA_EXMC+EX_R12)(r13) - - lwz r11, (PACA_EXMC+EX_CCR)(r13) + ld r0, (PACA_KVM_R0)(r13) + ld r1, (PACA_KVM_R1)(r13) + ld r2, (PACA_KVM_R2)(r13) + ld r3, (PACA_KVM_R3)(r13) + ld r4, (PACA_KVM_R4)(r13) + ld r5, (PACA_KVM_R5)(r13) + ld r6, (PACA_KVM_R6)(r13) + ld r7, (PACA_KVM_R7)(r13) + ld r8, (PACA_KVM_R8)(r13) + ld r9, (PACA_KVM_R9)(r13) + ld r10, (PACA_KVM_R10)(r13) + ld r12, (PACA_KVM_R12)(r13) + + lwz r11, (PACA_KVM_CR)(r13) mtcr r11 - ld r11, (PACA_EXMC+EX_R3)(r13) + ld r11, (PACA_KVM_XER)(r13) mtxer r11 - ld r11, (PACA_EXMC+EX_R11)(r13) - ld r13, (PACA_EXMC+EX_R13)(r13) + ld r11, (PACA_KVM_R11)(r13) + ld r13, (PACA_KVM_R13)(r13) RFI kvmppc_handler_trampoline_enter_end: @@ -162,28 +167,54 @@ kvmppc_handler_trampoline_exit: /* Register usage at this point: * - * SPRG_SCRATCH0 = guest R13 - * R01 = host R1 - * R02 = host R2 - * R10 = guest PC - * R11 = guest MSR - * R12 = exit handler id - * R13 = PACA - * PACA.exmc.CCR = guest CR - * PACA.exmc.R9 = guest R1 - * PACA.exmc.R10 = guest R10 - * PACA.exmc.R11 = guest R11 - * PACA.exmc.R12 = guest R12 - * PACA.exmc.R13 = guest R2 + * SPRG_SCRATCH0 = guest R13 + * R12 = exit handler id + * R13 = PACA + * PACA.KVM.SCRATCH0 = guest R12 + * PACA.KVM.SCRATCH1 = guest CR * */ /* Save registers */ - std r0, (PACA_EXMC+EX_SRR0)(r13) - std r9, (PACA_EXMC+EX_R3)(r13) - std r10, (PACA_EXMC+EX_LR)(r13) - std r11, (PACA_EXMC+EX_DAR)(r13) + std r0, PACA_KVM_R0(r13) + std r1, PACA_KVM_R1(r13) + std r2, PACA_KVM_R2(r13) + std r3, PACA_KVM_R3(r13) + std r4, PACA_KVM_R4(r13) + std r5, PACA_KVM_R5(r13) + std r6, PACA_KVM_R6(r13) + std r7, PACA_KVM_R7(r13) + std r8, PACA_KVM_R8(r13) + std r9, PACA_KVM_R9(r13) + std r10, PACA_KVM_R10(r13) + std r11, PACA_KVM_R11(r13) + + /* Restore R1/R2 so we can handle faults */ + ld r1, PACA_KVM_HOST_R1(r13) + ld r2, PACA_KVM_HOST_R2(r13) + + /* Save guest PC and MSR in GPRs */ + mfsrr0 r3 + mfsrr1 r4 + + /* Get scratch'ed off registers */ + mfspr r9, SPRN_SPRG_SCRATCH0 + std r9, PACA_KVM_R13(r13) + + ld r8, PACA_KVM_SCRATCH0(r13) + std r8, PACA_KVM_R12(r13) + + lwz r7, PACA_KVM_SCRATCH1(r13) + stw r7, PACA_KVM_CR(r13) + + /* Save more register state */ + + mfxer r6 + stw r6, PACA_KVM_XER(r13) + + mfdar r5 + mfdsisr r6 /* * In order for us to easily get the last instruction, @@ -202,17 +233,28 @@ kvmppc_handler_trampoline_exit: ld_last_inst: /* Save off the guest instruction we're at */ + + /* Set guest mode to 'jump over instruction' so if lwz faults + * we'll just continue at the next IP. */ + li r9, KVM_GUEST_MODE_SKIP + stb r9, PACA_KVM_IN_GUEST(r13) + /* 1) enable paging for data */ mfmsr r9 ori r11, r9, MSR_DR /* Enable paging for data */ mtmsr r11 /* 2) fetch the instruction */ - lwz r0, 0(r10) + li r0, KVM_INST_FETCH_FAILED /* In case lwz faults */ + lwz r0, 0(r3) /* 3) disable paging again */ mtmsr r9 no_ld_last_inst: + /* Unset guest mode */ + li r9, KVM_GUEST_MODE_NONE + stb r9, PACA_KVM_IN_GUEST(r13) + /* Restore bolted entries from the shadow and fix it along the way */ /* We don't store anything in entry 0, so we don't need to take care of it */ @@ -233,29 +275,27 @@ no_ld_last_inst: slb_do_exit: - /* Restore registers */ - - ld r11, (PACA_EXMC+EX_DAR)(r13) - ld r10, (PACA_EXMC+EX_LR)(r13) - ld r9, (PACA_EXMC+EX_R3)(r13) - - /* Save last inst */ - stw r0, (PACA_EXMC+EX_LR)(r13) - - /* Save DAR and DSISR before going to paged mode */ - mfdar r0 - std r0, (PACA_EXMC+EX_DAR)(r13) - mfdsisr r0 - stw r0, (PACA_EXMC+EX_DSISR)(r13) + /* Register usage at this point: + * + * R0 = guest last inst + * R1 = host R1 + * R2 = host R2 + * R3 = guest PC + * R4 = guest MSR + * R5 = guest DAR + * R6 = guest DSISR + * R12 = exit handler id + * R13 = PACA + * PACA.KVM.* = guest * + * + */ /* RFI into the highmem handler */ - mfmsr r0 - ori r0, r0, MSR_IR|MSR_DR|MSR_RI /* Enable paging */ - mtsrr1 r0 - ld r0, PACASAVEDMSR(r13) /* Highmem handler address */ - mtsrr0 r0 - - mfspr r0, SPRN_SPRG_SCRATCH0 + mfmsr r7 + ori r7, r7, MSR_IR|MSR_DR|MSR_RI /* Enable paging */ + mtsrr1 r7 + ld r8, PACA_KVM_VMHANDLER(r13) /* Highmem handler address */ + mtsrr0 r8 RFI kvmppc_handler_trampoline_exit_end: diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 06f5a9ecc42..4d686cc6b26 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -69,10 +69,10 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < 32; i += 4) { printk("gpr%02d: %08lx %08lx %08lx %08lx\n", i, - vcpu->arch.gpr[i], - vcpu->arch.gpr[i+1], - vcpu->arch.gpr[i+2], - vcpu->arch.gpr[i+3]); + kvmppc_get_gpr(vcpu, i), + kvmppc_get_gpr(vcpu, i+1), + kvmppc_get_gpr(vcpu, i+2), + kvmppc_get_gpr(vcpu, i+3)); } } @@ -82,8 +82,32 @@ static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, set_bit(priority, &vcpu->arch.pending_exceptions); } -void kvmppc_core_queue_program(struct kvm_vcpu *vcpu) +static void kvmppc_core_queue_dtlb_miss(struct kvm_vcpu *vcpu, + ulong dear_flags, ulong esr_flags) { + vcpu->arch.queued_dear = dear_flags; + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS); +} + +static void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, + ulong dear_flags, ulong esr_flags) +{ + vcpu->arch.queued_dear = dear_flags; + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE); +} + +static void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, + ulong esr_flags) +{ + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE); +} + +void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags) +{ + vcpu->arch.queued_esr = esr_flags; kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM); } @@ -97,6 +121,11 @@ int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) return test_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); } +void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); +} + void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { @@ -109,14 +138,19 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, { int allowed = 0; ulong msr_mask; + bool update_esr = false, update_dear = false; switch (priority) { - case BOOKE_IRQPRIO_PROGRAM: case BOOKE_IRQPRIO_DTLB_MISS: - case BOOKE_IRQPRIO_ITLB_MISS: - case BOOKE_IRQPRIO_SYSCALL: case BOOKE_IRQPRIO_DATA_STORAGE: + update_dear = true; + /* fall through */ case BOOKE_IRQPRIO_INST_STORAGE: + case BOOKE_IRQPRIO_PROGRAM: + update_esr = true; + /* fall through */ + case BOOKE_IRQPRIO_ITLB_MISS: + case BOOKE_IRQPRIO_SYSCALL: case BOOKE_IRQPRIO_FP_UNAVAIL: case BOOKE_IRQPRIO_SPE_UNAVAIL: case BOOKE_IRQPRIO_SPE_FP_DATA: @@ -151,6 +185,10 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, vcpu->arch.srr0 = vcpu->arch.pc; vcpu->arch.srr1 = vcpu->arch.msr; vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; + if (update_esr == true) + vcpu->arch.esr = vcpu->arch.queued_esr; + if (update_dear == true) + vcpu->arch.dear = vcpu->arch.queued_dear; kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask); clear_bit(priority, &vcpu->arch.pending_exceptions); @@ -223,8 +261,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, if (vcpu->arch.msr & MSR_PR) { /* Program traps generated by user-level software must be handled * by the guest kernel. */ - vcpu->arch.esr = vcpu->arch.fault_esr; - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM); + kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr); r = RESUME_GUEST; kvmppc_account_exit(vcpu, USR_PR_INST); break; @@ -280,16 +317,14 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOKE_INTERRUPT_DATA_STORAGE: - vcpu->arch.dear = vcpu->arch.fault_dear; - vcpu->arch.esr = vcpu->arch.fault_esr; - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE); + kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear, + vcpu->arch.fault_esr); kvmppc_account_exit(vcpu, DSI_EXITS); r = RESUME_GUEST; break; case BOOKE_INTERRUPT_INST_STORAGE: - vcpu->arch.esr = vcpu->arch.fault_esr; - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE); + kvmppc_core_queue_inst_storage(vcpu, vcpu->arch.fault_esr); kvmppc_account_exit(vcpu, ISI_EXITS); r = RESUME_GUEST; break; @@ -310,9 +345,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr); if (gtlb_index < 0) { /* The guest didn't have a mapping for it. */ - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS); - vcpu->arch.dear = vcpu->arch.fault_dear; - vcpu->arch.esr = vcpu->arch.fault_esr; + kvmppc_core_queue_dtlb_miss(vcpu, + vcpu->arch.fault_dear, + vcpu->arch.fault_esr); kvmppc_mmu_dtlb_miss(vcpu); kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS); r = RESUME_GUEST; @@ -426,7 +461,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { vcpu->arch.pc = 0; vcpu->arch.msr = 0; - vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */ + kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ vcpu->arch.shadow_pid = 1; @@ -444,10 +479,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; regs->pc = vcpu->arch.pc; - regs->cr = vcpu->arch.cr; + regs->cr = kvmppc_get_cr(vcpu); regs->ctr = vcpu->arch.ctr; regs->lr = vcpu->arch.lr; - regs->xer = vcpu->arch.xer; + regs->xer = kvmppc_get_xer(vcpu); regs->msr = vcpu->arch.msr; regs->srr0 = vcpu->arch.srr0; regs->srr1 = vcpu->arch.srr1; @@ -461,7 +496,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg7 = vcpu->arch.sprg6; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) - regs->gpr[i] = vcpu->arch.gpr[i]; + regs->gpr[i] = kvmppc_get_gpr(vcpu, i); return 0; } @@ -471,10 +506,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; vcpu->arch.pc = regs->pc; - vcpu->arch.cr = regs->cr; + kvmppc_set_cr(vcpu, regs->cr); vcpu->arch.ctr = regs->ctr; vcpu->arch.lr = regs->lr; - vcpu->arch.xer = regs->xer; + kvmppc_set_xer(vcpu, regs->xer); kvmppc_set_msr(vcpu, regs->msr); vcpu->arch.srr0 = regs->srr0; vcpu->arch.srr1 = regs->srr1; @@ -486,8 +521,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.sprg6 = regs->sprg5; vcpu->arch.sprg7 = regs->sprg6; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++) - vcpu->arch.gpr[i] = regs->gpr[i]; + for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) + kvmppc_set_gpr(vcpu, i, regs->gpr[i]); return 0; } diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index aebc65e93f4..cbc790ee192 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -62,20 +62,20 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case OP_31_XOP_MFMSR: rt = get_rt(inst); - vcpu->arch.gpr[rt] = vcpu->arch.msr; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.msr); kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS); break; case OP_31_XOP_MTMSR: rs = get_rs(inst); kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS); - kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]); + kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs)); break; case OP_31_XOP_WRTEE: rs = get_rs(inst); vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) - | (vcpu->arch.gpr[rs] & MSR_EE); + | (kvmppc_get_gpr(vcpu, rs) & MSR_EE); kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); break; @@ -101,22 +101,23 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) { int emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_DEAR: - vcpu->arch.dear = vcpu->arch.gpr[rs]; break; + vcpu->arch.dear = spr_val; break; case SPRN_ESR: - vcpu->arch.esr = vcpu->arch.gpr[rs]; break; + vcpu->arch.esr = spr_val; break; case SPRN_DBCR0: - vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.dbcr0 = spr_val; break; case SPRN_DBCR1: - vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.dbcr1 = spr_val; break; case SPRN_DBSR: - vcpu->arch.dbsr &= ~vcpu->arch.gpr[rs]; break; + vcpu->arch.dbsr &= ~spr_val; break; case SPRN_TSR: - vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break; + vcpu->arch.tsr &= ~spr_val; break; case SPRN_TCR: - vcpu->arch.tcr = vcpu->arch.gpr[rs]; + vcpu->arch.tcr = spr_val; kvmppc_emulate_dec(vcpu); break; @@ -124,64 +125,64 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) * loaded into the real SPRGs when resuming the * guest. */ case SPRN_SPRG4: - vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg4 = spr_val; break; case SPRN_SPRG5: - vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg5 = spr_val; break; case SPRN_SPRG6: - vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg6 = spr_val; break; case SPRN_SPRG7: - vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg7 = spr_val; break; case SPRN_IVPR: - vcpu->arch.ivpr = vcpu->arch.gpr[rs]; + vcpu->arch.ivpr = spr_val; break; case SPRN_IVOR0: - vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val; break; case SPRN_IVOR1: - vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = spr_val; break; case SPRN_IVOR2: - vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val; break; case SPRN_IVOR3: - vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val; break; case SPRN_IVOR4: - vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = spr_val; break; case SPRN_IVOR5: - vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = spr_val; break; case SPRN_IVOR6: - vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = spr_val; break; case SPRN_IVOR7: - vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = spr_val; break; case SPRN_IVOR8: - vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val; break; case SPRN_IVOR9: - vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val; break; case SPRN_IVOR10: - vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = spr_val; break; case SPRN_IVOR11: - vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = spr_val; break; case SPRN_IVOR12: - vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = spr_val; break; case SPRN_IVOR13: - vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = spr_val; break; case SPRN_IVOR14: - vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = spr_val; break; case SPRN_IVOR15: - vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val; break; default: @@ -197,65 +198,65 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_IVPR: - vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break; case SPRN_DEAR: - vcpu->arch.gpr[rt] = vcpu->arch.dear; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break; case SPRN_ESR: - vcpu->arch.gpr[rt] = vcpu->arch.esr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break; case SPRN_DBCR0: - vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break; case SPRN_DBCR1: - vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break; case SPRN_DBSR: - vcpu->arch.gpr[rt] = vcpu->arch.dbsr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break; case SPRN_IVOR0: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]); break; case SPRN_IVOR1: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]); break; case SPRN_IVOR2: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]); break; case SPRN_IVOR3: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]); break; case SPRN_IVOR4: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]); break; case SPRN_IVOR5: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]); break; case SPRN_IVOR6: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]); break; case SPRN_IVOR7: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]); break; case SPRN_IVOR8: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]); break; case SPRN_IVOR9: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]); break; case SPRN_IVOR10: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]); break; case SPRN_IVOR11: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]); break; case SPRN_IVOR12: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]); break; case SPRN_IVOR13: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]); break; case SPRN_IVOR14: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]); break; case SPRN_IVOR15: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]); break; default: diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 64949eef43f..efa1198940a 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -60,6 +60,12 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) kvmppc_e500_tlb_setup(vcpu_e500); + /* Registers init */ + vcpu->arch.pvr = mfspr(SPRN_PVR); + + /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ + vcpu->vcpu_id = 0; + return 0; } diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index be95b8d8e3b..8e3edfbc963 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -74,54 +74,59 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_PID: vcpu_e500->pid[0] = vcpu->arch.shadow_pid = - vcpu->arch.pid = vcpu->arch.gpr[rs]; + vcpu->arch.pid = spr_val; break; case SPRN_PID1: - vcpu_e500->pid[1] = vcpu->arch.gpr[rs]; break; + vcpu_e500->pid[1] = spr_val; break; case SPRN_PID2: - vcpu_e500->pid[2] = vcpu->arch.gpr[rs]; break; + vcpu_e500->pid[2] = spr_val; break; case SPRN_MAS0: - vcpu_e500->mas0 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas0 = spr_val; break; case SPRN_MAS1: - vcpu_e500->mas1 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas1 = spr_val; break; case SPRN_MAS2: - vcpu_e500->mas2 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas2 = spr_val; break; case SPRN_MAS3: - vcpu_e500->mas3 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas3 = spr_val; break; case SPRN_MAS4: - vcpu_e500->mas4 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas4 = spr_val; break; case SPRN_MAS6: - vcpu_e500->mas6 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas6 = spr_val; break; case SPRN_MAS7: - vcpu_e500->mas7 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas7 = spr_val; break; + case SPRN_L1CSR0: + vcpu_e500->l1csr0 = spr_val; + vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC); + break; case SPRN_L1CSR1: - vcpu_e500->l1csr1 = vcpu->arch.gpr[rs]; break; + vcpu_e500->l1csr1 = spr_val; break; case SPRN_HID0: - vcpu_e500->hid0 = vcpu->arch.gpr[rs]; break; + vcpu_e500->hid0 = spr_val; break; case SPRN_HID1: - vcpu_e500->hid1 = vcpu->arch.gpr[rs]; break; + vcpu_e500->hid1 = spr_val; break; case SPRN_MMUCSR0: emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500, - vcpu->arch.gpr[rs]); + spr_val); break; /* extra exceptions */ case SPRN_IVOR32: - vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = spr_val; break; case SPRN_IVOR33: - vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = spr_val; break; case SPRN_IVOR34: - vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = spr_val; break; case SPRN_IVOR35: - vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val; break; default: @@ -138,63 +143,57 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_PID: - vcpu->arch.gpr[rt] = vcpu_e500->pid[0]; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[0]); break; case SPRN_PID1: - vcpu->arch.gpr[rt] = vcpu_e500->pid[1]; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[1]); break; case SPRN_PID2: - vcpu->arch.gpr[rt] = vcpu_e500->pid[2]; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break; case SPRN_MAS0: - vcpu->arch.gpr[rt] = vcpu_e500->mas0; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break; case SPRN_MAS1: - vcpu->arch.gpr[rt] = vcpu_e500->mas1; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break; case SPRN_MAS2: - vcpu->arch.gpr[rt] = vcpu_e500->mas2; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break; case SPRN_MAS3: - vcpu->arch.gpr[rt] = vcpu_e500->mas3; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break; case SPRN_MAS4: - vcpu->arch.gpr[rt] = vcpu_e500->mas4; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break; case SPRN_MAS6: - vcpu->arch.gpr[rt] = vcpu_e500->mas6; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break; case SPRN_MAS7: - vcpu->arch.gpr[rt] = vcpu_e500->mas7; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break; case SPRN_TLB0CFG: - vcpu->arch.gpr[rt] = mfspr(SPRN_TLB0CFG); - vcpu->arch.gpr[rt] &= ~0xfffUL; - vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[0]; - break; - + kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break; case SPRN_TLB1CFG: - vcpu->arch.gpr[rt] = mfspr(SPRN_TLB1CFG); - vcpu->arch.gpr[rt] &= ~0xfffUL; - vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[1]; - break; - + kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb1cfg); break; + case SPRN_L1CSR0: + kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr0); break; case SPRN_L1CSR1: - vcpu->arch.gpr[rt] = vcpu_e500->l1csr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr1); break; case SPRN_HID0: - vcpu->arch.gpr[rt] = vcpu_e500->hid0; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid0); break; case SPRN_HID1: - vcpu->arch.gpr[rt] = vcpu_e500->hid1; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid1); break; case SPRN_MMUCSR0: - vcpu->arch.gpr[rt] = 0; break; + kvmppc_set_gpr(vcpu, rt, 0); break; case SPRN_MMUCFG: - vcpu->arch.gpr[rt] = mfspr(SPRN_MMUCFG); break; + kvmppc_set_gpr(vcpu, rt, mfspr(SPRN_MMUCFG)); break; /* extra exceptions */ case SPRN_IVOR32: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]); break; case SPRN_IVOR33: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]); break; case SPRN_IVOR34: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]); break; case SPRN_IVOR35: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]); break; default: emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index fb1e1dc11ba..0d772e6b631 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -417,7 +417,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) int esel, tlbsel; gva_t ea; - ea = ((ra) ? vcpu->arch.gpr[ra] : 0) + vcpu->arch.gpr[rb]; + ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb); ia = (ea >> 2) & 0x1; @@ -470,7 +470,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) struct tlbe *gtlbe = NULL; gva_t ea; - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); for (tlbsel = 0; tlbsel < 2; tlbsel++) { esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); @@ -728,6 +728,12 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) if (vcpu_e500->shadow_pages[1] == NULL) goto err_out_page0; + /* Init TLB configuration register */ + vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; + vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0]; + vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; + vcpu_e500->tlb1cfg |= vcpu_e500->guest_tlb_size[1]; + return 0; err_out_page0: diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 4a9ac6640fa..cb72a65f4ec 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -83,6 +83,9 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) pr_debug("mtDEC: %x\n", vcpu->arch.dec); #ifdef CONFIG_PPC64 + /* mtdec lowers the interrupt line when positive. */ + kvmppc_core_dequeue_dec(vcpu); + /* POWER4+ triggers a dec interrupt if the value is < 0 */ if (vcpu->arch.dec & 0x80000000) { hrtimer_try_to_cancel(&vcpu->arch.dec_timer); @@ -140,14 +143,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) pr_debug(KERN_INFO "Emulating opcode %d / %d\n", get_op(inst), get_xop(inst)); + /* Try again next time */ + if (inst == KVM_INST_FETCH_FAILED) + return EMULATE_DONE; + switch (get_op(inst)) { case OP_TRAP: #ifdef CONFIG_PPC64 case OP_TRAP_64: + kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); #else - vcpu->arch.esr |= ESR_PTR; + kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR); #endif - kvmppc_core_queue_program(vcpu); advance = 0; break; @@ -167,14 +174,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_31_XOP_STWX: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 4, 1); break; case OP_31_XOP_STBX: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 1, 1); break; @@ -183,14 +190,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rb = get_rb(inst); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 1, 1); - vcpu->arch.gpr[rs] = ea; + kvmppc_set_gpr(vcpu, rs, ea); break; case OP_31_XOP_LHZX: @@ -203,12 +210,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rb = get_rb(inst); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); - vcpu->arch.gpr[ra] = ea; + kvmppc_set_gpr(vcpu, ra, ea); break; case OP_31_XOP_MFSPR: @@ -217,47 +224,49 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) switch (sprn) { case SPRN_SRR0: - vcpu->arch.gpr[rt] = vcpu->arch.srr0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr0); break; case SPRN_SRR1: - vcpu->arch.gpr[rt] = vcpu->arch.srr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr1); break; case SPRN_PVR: - vcpu->arch.gpr[rt] = vcpu->arch.pvr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break; case SPRN_PIR: - vcpu->arch.gpr[rt] = vcpu->vcpu_id; break; + kvmppc_set_gpr(vcpu, rt, vcpu->vcpu_id); break; case SPRN_MSSSR0: - vcpu->arch.gpr[rt] = 0; break; + kvmppc_set_gpr(vcpu, rt, 0); break; /* Note: mftb and TBRL/TBWL are user-accessible, so * the guest can always access the real TB anyways. * In fact, we probably will never see these traps. */ case SPRN_TBWL: - vcpu->arch.gpr[rt] = get_tb() >> 32; break; + kvmppc_set_gpr(vcpu, rt, get_tb() >> 32); break; case SPRN_TBWU: - vcpu->arch.gpr[rt] = get_tb(); break; + kvmppc_set_gpr(vcpu, rt, get_tb()); break; case SPRN_SPRG0: - vcpu->arch.gpr[rt] = vcpu->arch.sprg0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg0); break; case SPRN_SPRG1: - vcpu->arch.gpr[rt] = vcpu->arch.sprg1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg1); break; case SPRN_SPRG2: - vcpu->arch.gpr[rt] = vcpu->arch.sprg2; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg2); break; case SPRN_SPRG3: - vcpu->arch.gpr[rt] = vcpu->arch.sprg3; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg3); break; /* Note: SPRG4-7 are user-readable, so we don't get * a trap. */ case SPRN_DEC: { u64 jd = get_tb() - vcpu->arch.dec_jiffies; - vcpu->arch.gpr[rt] = vcpu->arch.dec - jd; - pr_debug(KERN_INFO "mfDEC: %x - %llx = %lx\n", vcpu->arch.dec, jd, vcpu->arch.gpr[rt]); + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dec - jd); + pr_debug(KERN_INFO "mfDEC: %x - %llx = %lx\n", + vcpu->arch.dec, jd, + kvmppc_get_gpr(vcpu, rt)); break; } default: emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt); if (emulated == EMULATE_FAIL) { printk("mfspr: unknown spr %x\n", sprn); - vcpu->arch.gpr[rt] = 0; + kvmppc_set_gpr(vcpu, rt, 0); } break; } @@ -269,7 +278,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rb = get_rb(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 2, 1); break; @@ -278,14 +287,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rb = get_rb(inst); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 2, 1); - vcpu->arch.gpr[ra] = ea; + kvmppc_set_gpr(vcpu, ra, ea); break; case OP_31_XOP_MTSPR: @@ -293,9 +302,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rs = get_rs(inst); switch (sprn) { case SPRN_SRR0: - vcpu->arch.srr0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.srr0 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SRR1: - vcpu->arch.srr1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.srr1 = kvmppc_get_gpr(vcpu, rs); break; /* XXX We need to context-switch the timebase for * watchdog and FIT. */ @@ -305,18 +314,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case SPRN_MSSSR0: break; case SPRN_DEC: - vcpu->arch.dec = vcpu->arch.gpr[rs]; + vcpu->arch.dec = kvmppc_get_gpr(vcpu, rs); kvmppc_emulate_dec(vcpu); break; case SPRN_SPRG0: - vcpu->arch.sprg0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg0 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SPRG1: - vcpu->arch.sprg1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg1 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SPRG2: - vcpu->arch.sprg2 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg2 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SPRG3: - vcpu->arch.sprg3 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg3 = kvmppc_get_gpr(vcpu, rs); break; default: emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs); @@ -348,7 +357,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rb = get_rb(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 4, 0); break; @@ -363,7 +372,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rb = get_rb(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 2, 0); break; @@ -382,7 +391,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_LBZ: @@ -394,35 +403,39 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_STW: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 4, 1); break; case OP_STWU: ra = get_ra(inst); rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 4, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_STB: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_STBU: ra = get_ra(inst); rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 1, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_LHZ: @@ -434,21 +447,23 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_STH: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_STHU: ra = get_ra(inst); rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 2, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; default: @@ -461,6 +476,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) advance = 0; printk(KERN_ERR "Couldn't emulate instruction 0x%08x " "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst)); + kvmppc_core_queue_program(vcpu, 0); } } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index f06cf93b178..51aedd7f16b 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -137,6 +137,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) { kvmppc_free_vcpus(kvm); kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); kfree(kvm); } @@ -165,14 +166,24 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc) { return 0; } +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + return; +} + + void kvm_arch_flush_shadow(struct kvm *kvm) { } @@ -260,34 +271,35 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { - ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr]; - *gpr = run->dcr.data; + kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, run->dcr.data); } static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { - ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr]; + ulong gpr; - if (run->mmio.len > sizeof(*gpr)) { + if (run->mmio.len > sizeof(gpr)) { printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len); return; } if (vcpu->arch.mmio_is_bigendian) { switch (run->mmio.len) { - case 4: *gpr = *(u32 *)run->mmio.data; break; - case 2: *gpr = *(u16 *)run->mmio.data; break; - case 1: *gpr = *(u8 *)run->mmio.data; break; + case 4: gpr = *(u32 *)run->mmio.data; break; + case 2: gpr = *(u16 *)run->mmio.data; break; + case 1: gpr = *(u8 *)run->mmio.data; break; } } else { /* Convert BE data from userland back to LE. */ switch (run->mmio.len) { - case 4: *gpr = ld_le32((u32 *)run->mmio.data); break; - case 2: *gpr = ld_le16((u16 *)run->mmio.data); break; - case 1: *gpr = *(u8 *)run->mmio.data; break; + case 4: gpr = ld_le32((u32 *)run->mmio.data); break; + case 2: gpr = ld_le16((u16 *)run->mmio.data); break; + case 1: gpr = *(u8 *)run->mmio.data; break; } } + + kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); } int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3fa0a10e466..49292869a5c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -242,6 +242,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_free_physmem(kvm); free_page((unsigned long)(kvm->arch.sca)); debug_unregister(kvm->arch.dbf); + cleanup_srcu_struct(&kvm->srcu); kfree(kvm); } @@ -690,14 +691,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } /* Section: memory related */ -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc) { - int i; - struct kvm_vcpu *vcpu; - /* A few sanity checks. We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a page boundary in userland and which has to end at a page boundary. @@ -720,14 +719,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (!user_alloc) return -EINVAL; + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + int i; + struct kvm_vcpu *vcpu; + /* request update of sie control block for all available vcpus */ kvm_for_each_vcpu(i, vcpu, kvm) { if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) continue; kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP); } - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 06cce8285ba..60f09ab3672 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -67,10 +67,14 @@ static inline long kvm_s390_vcpu_get_memsize(struct kvm_vcpu *vcpu) static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu) { + int idx; struct kvm_memory_slot *mem; + struct kvm_memslots *memslots; - down_read(&vcpu->kvm->slots_lock); - mem = &vcpu->kvm->memslots[0]; + idx = srcu_read_lock(&vcpu->kvm->srcu); + memslots = rcu_dereference(vcpu->kvm->memslots); + + mem = &memslots->memslots[0]; vcpu->arch.sie_block->gmsor = mem->userspace_addr; vcpu->arch.sie_block->gmslm = @@ -78,7 +82,7 @@ static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu) (mem->npages << PAGE_SHIFT) + VIRTIODESCSPACE - 1ul; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } /* implemented in priv.c */ diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 9f828f87ca3..493092efaa3 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -11,6 +11,7 @@ header-y += sigcontext32.h header-y += ucontext.h header-y += processor-flags.h header-y += hw_breakpoint.h +header-y += hyperv.h unifdef-y += e820.h unifdef-y += ist.h diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h new file mode 100644 index 00000000000..e153a2b3889 --- /dev/null +++ b/arch/x86/include/asm/hyperv.h @@ -0,0 +1,186 @@ +#ifndef _ASM_X86_KVM_HYPERV_H +#define _ASM_X86_KVM_HYPERV_H + +#include <linux/types.h> + +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). + */ +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 + +/* + * Feature identification. EAX indicates which features are available + * to the partition based upon the current partition privileges. + */ + +/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) +/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ +#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* + * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM + * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available + */ +#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) +/* + * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through + * HV_X64_MSR_STIMER3_COUNT) available + */ +#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3) +/* + * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) + * are available + */ +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) +/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ +#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) +/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ +#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) +/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ +#define HV_X64_MSR_RESET_AVAILABLE (1 << 7) + /* + * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, + * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available + */ +#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) + +/* + * Feature identification: EBX indicates which flags were specified at + * partition creation. The format is the same as the partition creation + * flag structure defined in section Partition Creation Flags. + */ +#define HV_X64_CREATE_PARTITIONS (1 << 0) +#define HV_X64_ACCESS_PARTITION_ID (1 << 1) +#define HV_X64_ACCESS_MEMORY_POOL (1 << 2) +#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) +#define HV_X64_POST_MESSAGES (1 << 4) +#define HV_X64_SIGNAL_EVENTS (1 << 5) +#define HV_X64_CREATE_PORT (1 << 6) +#define HV_X64_CONNECT_PORT (1 << 7) +#define HV_X64_ACCESS_STATS (1 << 8) +#define HV_X64_DEBUGGING (1 << 11) +#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) +#define HV_X64_CONFIGURE_PROFILER (1 << 13) + +/* + * Feature identification. EDX indicates which miscellaneous features + * are available to the partition. + */ +/* The MWAIT instruction is available (per section MONITOR / MWAIT) */ +#define HV_X64_MWAIT_AVAILABLE (1 << 0) +/* Guest debugging support is available */ +#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) +/* Performance Monitor support is available*/ +#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) +/* Support for physical CPU dynamic partitioning events is available*/ +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) +/* + * Support for passing hypercall input parameter block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) +/* Support for a virtual guest idle state is available */ +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) + +/* + * Implementation recommendations. Indicates which behaviors the hypervisor + * recommends the OS implement for optimal performance. + */ + /* + * Recommend using hypercall for address space switches rather + * than MOV to CR3 instruction + */ +#define HV_X64_MWAIT_RECOMMENDED (1 << 0) +/* Recommend using hypercall for local TLB flushes rather + * than INVLPG or MOV to CR3 instructions */ +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) +/* + * Recommend using hypercall for remote TLB flushes rather + * than inter-processor interrupts + */ +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) +/* + * Recommend using MSRs for accessing APIC registers + * EOI, ICR and TPR rather than their memory-mapped counterparts + */ +#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) +/* Recommend using the hypervisor-provided MSR to initiate a system RESET */ +#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) +/* + * Recommend using relaxed timing for this partition. If used, + * the VM should disable any watchdog timeouts that rely on the + * timely delivery of external interrupts + */ +#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) + +/* MSR used to identify the guest OS. */ +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 + +/* MSR used to setup pages used to communicate with the hypervisor. */ +#define HV_X64_MSR_HYPERCALL 0x40000001 + +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX 0x40000002 + +/* Define the virtual APIC registers */ +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073 + +/* Define synthetic interrupt controller model specific registers. */ +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F + + +#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) + +/* Declare the various hypercall operations. */ +#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008 + +#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +#define HV_PROCESSOR_POWER_STATE_C0 0 +#define HV_PROCESSOR_POWER_STATE_C1 1 +#define HV_PROCESSOR_POWER_STATE_C2 2 +#define HV_PROCESSOR_POWER_STATE_C3 3 + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 + +#endif diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7c18e1230f5..7a6f54fa13b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -54,13 +54,23 @@ struct x86_emulate_ctxt; struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. - * Used for instruction fetch, stack operations, and others. + * Used for descriptor reading. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); + + /* + * fetch: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*fetch)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); /* * read_emulated: Read bytes from emulated/special memory area. @@ -74,7 +84,7 @@ struct x86_emulate_ops { struct kvm_vcpu *vcpu); /* - * write_emulated: Read bytes from emulated/special memory area. + * write_emulated: Write bytes to emulated/special memory area. * @addr: [IN ] Linear address to which to write. * @val: [IN ] Value to write to memory (low-order bytes used as * required). @@ -168,6 +178,7 @@ struct x86_emulate_ctxt { /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f865e8b854..06d9e79ca37 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -25,7 +25,7 @@ #include <asm/mtrr.h> #include <asm/msr-index.h> -#define KVM_MAX_VCPUS 16 +#define KVM_MAX_VCPUS 64 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 @@ -38,19 +38,6 @@ #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK \ - (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) @@ -256,7 +243,8 @@ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, + u32 *error); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -282,13 +270,15 @@ struct kvm_vcpu_arch { u32 regs_dirty; unsigned long cr0; + unsigned long cr0_guest_owned_bits; unsigned long cr2; unsigned long cr3; unsigned long cr4; + unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; u64 pdptrs[4]; /* pae */ - u64 shadow_efer; + u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ int32_t apic_arb_prio; @@ -374,17 +364,27 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ u16 singlestep_cs; unsigned long singlestep_rip; + /* fields used by HYPER-V emulation */ + u64 hv_vapic; }; struct kvm_mem_alias { gfn_t base_gfn; unsigned long npages; gfn_t target_gfn; +#define KVM_ALIAS_INVALID 1UL + unsigned long flags; }; -struct kvm_arch{ - int naliases; +#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION + +struct kvm_mem_aliases { struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; + int naliases; +}; + +struct kvm_arch { + struct kvm_mem_aliases *aliases; unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; @@ -416,6 +416,10 @@ struct kvm_arch{ s64 kvmclock_offset; struct kvm_xen_hvm_config xen_hvm_config; + + /* fields used by HYPER-V emulation */ + u64 hv_guest_os_id; + u64 hv_hypercall; }; struct kvm_vm_stat { @@ -471,6 +475,7 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); + void (*cpuid_update)(struct kvm_vcpu *vcpu); /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); @@ -492,6 +497,7 @@ struct kvm_x86_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -501,12 +507,13 @@ struct kvm_x86_ops { void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); - void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception); + int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); + int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_activate)(struct kvm_vcpu *vcpu); + void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -531,7 +538,8 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - bool (*gb_page_enable)(void); + int (*get_lpage_level)(void); + bool (*rdtscp_supported)(void); const struct trace_print_flags *exit_reasons_str; }; @@ -606,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg); +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); @@ -653,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -666,6 +677,7 @@ void kvm_disable_tdp(void); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); +bool kvm_check_iopl(struct kvm_vcpu *vcpu); struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c584076a47f..ffae1420e7d 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -2,6 +2,7 @@ #define _ASM_X86_KVM_PARA_H #include <linux/types.h> +#include <asm/hyperv.h> /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1fecb7e6113..38638cd2fa4 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -313,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXIT_ERR -1 -#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ +#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b4945419a8..fb9a080740e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -53,6 +53,7 @@ */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_RDTSCP 0x00000008 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 @@ -251,6 +252,7 @@ enum vmcs_field { #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 @@ -362,6 +364,7 @@ enum vmcs_field { #define VMX_EPTP_UC_BIT (1ull << 8) #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) +#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) @@ -374,7 +377,7 @@ enum vmcs_field { #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull -#define VMX_EPT_IGMT_BIT (1ull << 6) +#define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9055e5872ff..1c0c6ab9c60 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -301,7 +301,8 @@ static int __init vsyscall_init(void) register_sysctl_table(kernel_root_table2); #endif on_each_cpu(cpu_vsyscall_init, NULL, 1); - hotcpu_notifier(cpu_vsyscall_notifier, 0); + /* notifier priority > KVM */ + hotcpu_notifier(cpu_vsyscall_notifier, 30); return 0; } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3c4d0109ad2..970bbd47951 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE select USER_RETURN_NOTIFIER + select KVM_MMIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7e8faea4651..4dade6ac082 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -32,7 +32,7 @@ #include <linux/module.h> #include <asm/kvm_emulate.h> -#include "mmu.h" /* for is_long_mode() */ +#include "x86.h" /* * Opcode effective-address decode tables. @@ -76,6 +76,8 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ +#define Lock (1<<26) /* lock prefix is allowed for the instruction */ +#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ #define Src2None (0<<29) @@ -88,39 +90,40 @@ enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group8, Group9, }; static u32 opcode_table[256] = { /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, 0, /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x38 - 0x3F */ @@ -156,7 +159,7 @@ static u32 opcode_table[256] = { Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -210,7 +213,7 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, + ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -218,16 +221,20 @@ static u32 opcode_table[256] = { static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + 0, Group | GroupDual | Group7, 0, 0, + 0, ImplicitOps, ImplicitOps | Priv, 0, + ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, + 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + ModRM | ImplicitOps | Priv, ModRM | Priv, + ModRM | ImplicitOps | Priv, ModRM | Priv, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, + ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, + ImplicitOps, ImplicitOps | Priv, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -257,21 +264,23 @@ static u32 twobyte_table[256] = { DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, DstMem | SrcReg | Src2ImmByte | ModRM, DstMem | SrcReg | Src2CL | ModRM, ModRM, 0, /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, + Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, + 0, 0, 0, Group | GroupDual | Group9, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -283,25 +292,41 @@ static u32 twobyte_table[256] = { static u32 group_table[] = { [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM, [Group1_81*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM, [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64, [Group1_83*8] = - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = @@ -320,24 +345,39 @@ static u32 group_table[] = { SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = - 0, 0, ModRM | SrcMem, ModRM | SrcMem, + 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, + SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, + [Group8*8] = + 0, 0, 0, 0, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, + [Group9*8] = + 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, + [Group9*8] = + 0, 0, 0, 0, 0, 0, 0, 0, }; /* EFLAGS bit definitions. */ +#define EFLG_ID (1<<21) +#define EFLG_VIP (1<<20) +#define EFLG_VIF (1<<19) +#define EFLG_AC (1<<18) #define EFLG_VM (1<<17) #define EFLG_RF (1<<16) +#define EFLG_IOPL (3<<12) +#define EFLG_NT (1<<14) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) #define EFLG_IF (1<<9) +#define EFLG_TF (1<<8) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) @@ -606,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, if (linear < fc->start || linear >= fc->end) { size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); + rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); if (rc) return rc; fc->start = linear; @@ -661,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, op_bytes = 3; *address = 0; rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); + ctxt->vcpu, NULL); if (rc) return rc; rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); + ctxt->vcpu, NULL); return rc; } @@ -889,6 +929,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) switch (mode) { case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; @@ -975,7 +1016,7 @@ done_prefixes: } if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; + kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction"); return -1; } @@ -1196,13 +1237,56 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), dest, len, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); return rc; } +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + int rc; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); + + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; + } + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; +} + static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) { struct decode_cache *c = &ctxt->decode; @@ -1225,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, if (rc != 0) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); return rc; } @@ -1370,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, int rc; rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || @@ -1385,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, (u32) c->regs[VCPU_REGS_RBX]; rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags |= EFLG_ZF; } @@ -1407,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); if (rc) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); return rc; } @@ -1451,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &c->dst.val, c->dst.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; break; case OP_NONE: @@ -1514,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) u64 msr_data; /* syscall is not available in real mode */ - if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) - return -1; + if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1553,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); } - return 0; + return X86EMUL_CONTINUE; } static int @@ -1563,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) struct kvm_segment cs, ss; u64 msr_data; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || - !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + /* inject #GP if in real mode */ + if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ if (ctxt->mode == X86EMUL_MODE_PROT64) - return -1; + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1587,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; } @@ -1618,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; - return 0; + return X86EMUL_CONTINUE; } static int @@ -1629,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) u64 msr_data; int usermode; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - /* sysexit must be called from CPL 0 */ - if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + /* inject #GP if in real mode or Virtual 8086 mode */ + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1661,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = (u16)(msr_data + 24); break; @@ -1669,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 32); if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = cs.selector + 8; cs.db = 0; @@ -1685,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; - return 0; + return X86EMUL_CONTINUE; +} + +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) +{ + int iopl; + if (ctxt->mode == X86EMUL_MODE_REAL) + return false; + if (ctxt->mode == X86EMUL_MODE_VM86) + return true; + iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; +} + +static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + struct kvm_segment tr_seg; + int r; + u16 io_bitmap_ptr; + u8 perm, bit_idx = port & 0x7; + unsigned mask = (1 << len) - 1; + + kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); + if (tr_seg.unusable) + return false; + if (tr_seg.limit < 103) + return false; + r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, + NULL); + if (r != X86EMUL_CONTINUE) + return false; + if (io_bitmap_ptr + port/8 > tr_seg.limit) + return false; + r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, + ctxt->vcpu, NULL); + if (r != X86EMUL_CONTINUE) + return false; + if ((perm >> bit_idx) & mask) + return false; + return true; +} + +static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + if (emulator_bad_iopl(ctxt)) + if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) + return false; + return true; } int @@ -1709,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); saved_eip = c->eip; + /* LOCK prefix is allowed only with some instructions */ + if (c->lock_prefix && !(c->d & Lock)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + + /* Privileged instruction can be executed only in CPL=0 */ + if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) memop = c->modrm_ea; @@ -1749,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) &c->src.val, c->src.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; } @@ -1768,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) c->dst.ptr = (void *)c->dst.ptr + (c->src.val & mask) / 8; } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) - goto done; + if (!(c->d & Mov)) { + /* optimisation - avoid slow emulated read */ + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + } } c->dst.orig_val = c->dst.val; @@ -1876,7 +2010,12 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio_string(ctxt->vcpu, 1, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? @@ -1892,6 +2031,11 @@ special_insn: return 0; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } if (kvm_emulate_pio_string(ctxt->vcpu, 0, (c->d & ByteOp) ? 1 : c->op_bytes, @@ -1978,25 +2122,19 @@ special_insn: break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; - int type_bits; - int err; sel = c->src.val; - if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - if (c->modrm_reg <= 5) { - type_bits = (c->modrm_reg == 1) ? 9 : 1; - err = kvm_load_segment_descriptor(ctxt->vcpu, sel, - type_bits, c->modrm_reg); - } else { - printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_CS || + c->modrm_reg > VCPU_SREG_GS) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; } - if (err < 0) - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + + rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -2025,7 +2163,10 @@ special_insn: c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; c->dst.val = c->src.val; @@ -2039,11 +2180,12 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) + c->dst.bytes, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2058,10 +2200,11 @@ special_insn: c->src.ptr = (unsigned long *)register_address(c, seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; c->dst.type = OP_NONE; /* Disable writeback. */ @@ -2069,10 +2212,11 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); @@ -2102,12 +2246,13 @@ special_insn: c->dst.type = OP_REG; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2163,11 +2308,9 @@ special_insn: case 0xe9: /* jmp rel */ goto jmp; case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, - VCPU_SREG_CS) < 0) { - DPRINTF("jmp far: Failed to load CS descriptor\n"); - goto cannot_emulate; - } + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, + VCPU_SREG_CS)) + goto done; c->eip = c->src.val; break; @@ -2185,7 +2328,13 @@ special_insn: case 0xef: /* out (e/r)ax,dx */ port = c->regs[VCPU_REGS_RDX]; io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, + do_io: + if (!emulator_io_permited(ctxt, ops, port, + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, (c->d & ByteOp) ? 1 : c->op_bytes, port) != 0) { c->eip = saved_eip; @@ -2210,13 +2359,21 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfb: /* sti */ - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; @@ -2319,8 +2476,9 @@ twobyte_insn: } break; case 0x05: /* syscall */ - if (emulate_syscall(ctxt) == -1) - goto cannot_emulate; + rc = emulate_syscall(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; @@ -2391,14 +2549,16 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - if (emulate_sysenter(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysenter(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; case 0x35: /* sysexit */ - if (emulate_sysexit(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysexit(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 15578f180e5..294698b6daf 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -242,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_dec_return(&ps->pit_timer.pending) < 0) atomic_inc(&ps->pit_timer.pending); ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -605,7 +605,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = { .write = speaker_ioport_write, }; -/* Caller must have writers lock on slots_lock */ +/* Caller must hold slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; @@ -624,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); + raw_spin_lock_init(&pit->pit_state.inject_lock); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -645,13 +645,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); if (ret < 0) goto fail_unregister; @@ -660,11 +660,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) return pit; fail_unregister: - __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail: - if (pit->irq_source_id >= 0) - kvm_free_irq_source_id(kvm, pit->irq_source_id); + kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; @@ -723,12 +724,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) /* Try to inject pending interrupts when * last one has been acked. */ - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { ps->irq_ack = 0; inject = 1; } - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); if (inject) __inject_pit_timer_intr(kvm); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index d4c1c7ffdc0..900d6b0ba7c 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - spinlock_t inject_lock; + raw_spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d057c0cbd24..07771da85de 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -44,18 +44,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * Other interrupt may be delivered to PIC while lock is dropped but * it should be safe since PIC state is already updated at this stage. */ - spin_unlock(&s->pics_state->lock); + raw_spin_unlock(&s->pics_state->lock); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); - spin_lock(&s->pics_state->lock); + raw_spin_lock(&s->pics_state->lock); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + + raw_spin_lock(&s->lock); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } /* @@ -156,9 +157,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - spin_lock(&s->lock); + raw_spin_lock(&s->lock); pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -166,14 +167,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - spin_lock(&s->lock); + raw_spin_lock(&s->lock); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return ret; } @@ -203,7 +204,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + raw_spin_lock(&s->lock); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -228,7 +229,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return intno; } @@ -442,7 +443,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -455,7 +456,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -472,7 +473,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -486,7 +487,7 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -520,7 +521,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - spin_lock_init(&s->lock); + raw_spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; @@ -533,7 +534,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + mutex_unlock(&kvm->slots_lock); if (ret < 0) { kfree(s); return NULL; @@ -541,3 +544,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) return s; } + +void kvm_destroy_pic(struct kvm *kvm) +{ + struct kvm_pic *vpic = kvm->arch.vpic; + + if (vpic) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm->arch.vpic = NULL; + kfree(vpic); + } +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index be399e207d5..34b15915754 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -62,7 +62,7 @@ struct kvm_kpic_state { }; struct kvm_pic { - spinlock_t lock; + raw_spinlock_t lock; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ @@ -75,6 +75,7 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); void kvm_pic_clear_isr_ack(struct kvm *kvm); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 7bcc5b6a440..cff851cf532 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -1,6 +1,11 @@ #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H +#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR4_GUEST_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) + static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) { @@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; + if (tmask & vcpu->arch.cr0_guest_owned_bits) + kvm_x86_ops->decache_cr0_guest_bits(vcpu); + return vcpu->arch.cr0 & mask; +} + +static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, ~0UL); +} + +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; + if (tmask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba8c045da78..4b224f90087 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1246,3 +1246,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 0; } + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + /* if this is ICR write vector before command */ + if (reg == APIC_ICR) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (reg == APIC_ICR) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 40010b09c4a..f5fe32c5eda 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; +} #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 89a49fb46a2..741373e8ca7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@ */ #include "mmu.h" +#include "x86.h" #include "kvm_cache_regs.h" #include <linux/kvm_host.h> @@ -29,6 +30,7 @@ #include <linux/swap.h> #include <linux/hugetlb.h> #include <linux/compiler.h> +#include <linux/srcu.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -136,16 +138,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) - -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - #define RMAP_EXT 4 #define ACC_EXEC_MASK 1 @@ -153,6 +145,9 @@ module_param(oos_shadow, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#include <trace/events/kvm.h> + +#undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS #include "mmutrace.h" @@ -229,7 +224,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static int is_write_protection(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr0 & X86_CR0_WP; + return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } static int is_cpuid_PSE36(void) @@ -239,7 +234,7 @@ static int is_cpuid_PSE36(void) static int is_nx(struct kvm_vcpu *vcpu) { - return vcpu->arch.shadow_efer & EFER_NX; + return vcpu->arch.efer & EFER_NX; } static int is_shadow_present_pte(u64 pte) @@ -253,7 +248,7 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_writeble_pte(unsigned long pte) +static int is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } @@ -470,24 +465,10 @@ static int has_wrprotected_page(struct kvm *kvm, static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { - unsigned long page_size = PAGE_SIZE; - struct vm_area_struct *vma; - unsigned long addr; + unsigned long page_size; int i, ret = 0; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return PT_PAGE_TABLE_LEVEL; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) - goto out; - - page_size = vma_kernel_pagesize(vma); - -out: - up_read(¤t->mm->mmap_sem); + page_size = kvm_host_page_size(kvm, gfn); for (i = PT_PAGE_TABLE_LEVEL; i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { @@ -503,8 +484,7 @@ out: static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level; - int level = PT_PAGE_TABLE_LEVEL; + int host_level, level, max_level; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) @@ -515,7 +495,10 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) + max_level = kvm_x86_ops->get_lpage_level() < host_level ? + kvm_x86_ops->get_lpage_level() : host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; @@ -633,7 +616,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pfn = spte_to_pfn(*spte); if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { @@ -662,6 +645,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } + pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); BUG(); } } @@ -708,7 +692,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } @@ -732,7 +716,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { rmap_remove(kvm, spte); --kvm->stat.lpages; __set_spte(spte, shadow_trap_nonpresent_pte); @@ -787,7 +771,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(spte_to_pfn(*spte)); __set_spte(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); @@ -805,35 +789,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data)) { int i, j; + int ret; int retval = 0; + struct kvm_memslots *slots; - /* - * If mmap_sem isn't taken, we can look the memslots with only - * the mmu_lock by skipping over the slots with userspace_addr == 0. - */ - for (i = 0; i < kvm->nmemslots; i++) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + slots = rcu_dereference(kvm->memslots); + + for (i = 0; i < slots->nmemslots; i++) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; unsigned long start = memslot->userspace_addr; unsigned long end; - /* mmu_lock protects userspace_addr */ - if (!start) - continue; - end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - retval |= handler(kvm, &memslot->rmap[gfn_offset], - data); + ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { int idx = gfn_offset; idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); - retval |= handler(kvm, + ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); } + trace_kvm_age_page(hva, memslot, ret); + retval |= ret; } } @@ -856,9 +837,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 *spte; int young = 0; - /* always return old for EPT */ + /* + * Emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ if (!shadow_accessed_mask) - return 0; + return kvm_unmap_rmapp(kvm, rmapp, data); spte = rmap_next(kvm, rmapp, NULL); while (spte) { @@ -1615,7 +1602,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) { - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + int slot = memslot_id(kvm, gfn); struct kvm_mmu_page *sp = page_header(__pa(pte)); __set_bit(slot, sp->slot_bitmap); @@ -1639,7 +1626,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) { struct page *page; - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); if (gpa == UNMAPPED_GVA) return NULL; @@ -1852,7 +1839,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * is responsibility of mmu_get_page / kvm_sync_page. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writeble_pte(*sptep)) + if (!can_unsync && is_writable_pte(*sptep)) goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { @@ -1860,7 +1847,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) + if (is_writable_pte(spte)) spte &= ~PT_WRITABLE_MASK; } } @@ -1881,7 +1868,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writeble = is_writeble_pte(*sptep); + int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -1932,7 +1919,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } else { - if (was_writeble) + if (was_writable) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); @@ -2162,8 +2149,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) { + if (error) + *error = 0; return vaddr; } @@ -2747,7 +2737,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) if (tdp_enabled) return 0; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); @@ -2847,16 +2837,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) */ page = alloc_page(GFP_KERNEL | __GFP_DMA32); if (!page) - goto error_1; + return -ENOMEM; + vcpu->arch.mmu.pae_root = page_address(page); for (i = 0; i < 4; ++i) vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; return 0; - -error_1: - free_mmu_pages(vcpu); - return -ENOMEM; } int kvm_mmu_create(struct kvm_vcpu *vcpu) @@ -2936,10 +2923,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages; + int npages, idx; - if (!down_read_trylock(&kvm->slots_lock)) - continue; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; @@ -2952,7 +2938,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) nr_to_scan--; spin_unlock(&kvm->mmu_lock); - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); @@ -3019,9 +3005,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) int i; unsigned int nr_mmu_pages; unsigned int nr_pages = 0; + struct kvm_memslots *slots; - for (i = 0; i < kvm->nmemslots; i++) - nr_pages += kvm->memslots[i].npages; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) + nr_pages += slots->memslots[i].npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, @@ -3246,7 +3234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) audit_mappings_page(vcpu, ent, va, level - 1); else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; @@ -3291,10 +3279,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu) static int count_rmaps(struct kvm_vcpu *vcpu) { int nmaps = 0; - int i, j, k; + int i, j, k, idx; + idx = srcu_read_lock(&kvm->srcu); + slots = rcu_dereference(kvm->memslots); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_memory_slot *m = &slots->memslots[i]; struct kvm_rmap_desc *d; for (j = 0; j < m->npages; ++j) { @@ -3317,6 +3307,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) } } } + srcu_read_unlock(&kvm->srcu, idx); return nmaps; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61a1b3884b4..be66759321a 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -2,6 +2,7 @@ #define __KVM_X86_MMU_H #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -37,6 +38,16 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) @@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LMA; -#else - return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PAE; -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PSE; -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr0 & X86_CR0_PG; -} - static inline int is_present_gpte(unsigned long pte) { return pte & PT_PRESENT_MASK; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ede2131a922..81eab9a50e6 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -162,7 +162,7 @@ walk: if (rsvd_fault) goto access_error; - if (write_fault && !is_writeble_pte(pte)) + if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) goto access_error; @@ -490,18 +490,23 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, + u32 *error) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, + !!(access & PFERR_WRITE_MASK), + !!(access & PFERR_USER_MASK), + !!(access & PFERR_FETCH_MASK)); if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; - } + } else if (error) + *error = walker.error_code; return gpa; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1d9b33843c8..52f78dd0301 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -231,7 +231,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; } static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, @@ -540,6 +540,8 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; + svm->vcpu.fpu_active = 1; + control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK; @@ -552,13 +554,19 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | - INTERCEPT_DR3_MASK; + INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | + INTERCEPT_DR7_MASK; control->intercept_dr_write = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | INTERCEPT_DR7_MASK; control->intercept_exceptions = (1 << PF_VECTOR) | @@ -569,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | (1ULL << INTERCEPT_SMI) | + (1ULL << INTERCEPT_SELECTIVE_CR0) | (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | @@ -641,10 +650,8 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); - control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); - control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); + control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; + control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; save->g_pat = 0x0007040600070406ULL; save->cr3 = 0; save->cr4 = 0; @@ -730,7 +737,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) init_vmcb(svm); fx_init(&svm->vcpu); - svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; @@ -765,14 +771,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(cpu != vcpu->cpu)) { u64 delta; - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - delta = vcpu->arch.host_tsc - native_read_tsc(); - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; + if (check_tsc_unstable()) { + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + delta = vcpu->arch.host_tsc - native_read_tsc(); + svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; + } vcpu->cpu = cpu; kvm_migrate_timers(vcpu); svm->asid_generation = 0; @@ -954,42 +962,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) svm->vmcb->save.gdtr.base = dt->base ; } +static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ +} + static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } +static void update_cr0_intercept(struct vcpu_svm *svm) +{ + ulong gcr0 = svm->vcpu.arch.cr0; + u64 *hcr0 = &svm->vmcb->save.cr0; + + if (!svm->vcpu.fpu_active) + *hcr0 |= SVM_CR0_SELECTIVE_MASK; + else + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); + + + if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + } else { + svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + } +} + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer |= EFER_LMA; + vcpu->arch.efer |= EFER_LMA; svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); } } #endif - if (npt_enabled) - goto set; + vcpu->arch.cr0 = cr0; - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - vcpu->fpu_active = 1; - } + if (!npt_enabled) + cr0 |= X86_CR0_PG | X86_CR0_WP; - vcpu->arch.cr0 = cr0; - cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - } -set: /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at @@ -997,6 +1022,7 @@ set: */ cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; + update_cr0_intercept(svm); } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1102,76 +1128,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->vmcb->control.asid = sd->next_asid++; } -static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) +static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) { struct vcpu_svm *svm = to_svm(vcpu); - unsigned long val; switch (dr) { case 0 ... 3: - val = vcpu->arch.db[dr]; + *dest = vcpu->arch.db[dr]; break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr6; + *dest = vcpu->arch.dr6; else - val = svm->vmcb->save.dr6; + *dest = svm->vmcb->save.dr6; break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr7; + *dest = vcpu->arch.dr7; else - val = svm->vmcb->save.dr7; + *dest = svm->vmcb->save.dr7; break; - default: - val = 0; } - return val; + return EMULATE_DONE; } -static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception) +static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); - *exception = 0; - switch (dr) { case 0 ... 3: vcpu->arch.db[dr] = value; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = value; - return; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - *exception = UD_VECTOR; - return; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; - return; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { svm->vmcb->save.dr7 = vcpu->arch.dr7; vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); } - return; - default: - /* FIXME: Possible case? */ - printk(KERN_DEBUG "%s: unexpected dr %u\n", - __func__, dr); - *exception = UD_VECTOR; - return; + break; } + + return EMULATE_DONE; } static int pf_interception(struct vcpu_svm *svm) @@ -1239,13 +1259,17 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } -static int nm_interception(struct vcpu_svm *svm) +static void svm_fpu_activate(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) - svm->vmcb->save.cr0 &= ~X86_CR0_TS; svm->vcpu.fpu_active = 1; + update_cr0_intercept(svm); +} +static int nm_interception(struct vcpu_svm *svm) +{ + svm_fpu_activate(&svm->vcpu); return 1; } @@ -1337,7 +1361,7 @@ static int vmmcall_interception(struct vcpu_svm *svm) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) + if (!(svm->vcpu.arch.efer & EFER_SVME) || !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -1740,8 +1764,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.ds = vmcb->save.ds; hsave->save.gdtr = vmcb->save.gdtr; hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.shadow_efer; - hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.efer = svm->vcpu.arch.efer; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; hsave->save.rip = svm->next_rip; @@ -2153,9 +2177,10 @@ static int rdmsr_interception(struct vcpu_svm *svm) u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data; - if (svm_get_msr(&svm->vcpu, ecx, &data)) + if (svm_get_msr(&svm->vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); - else { + } else { trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; @@ -2247,13 +2272,15 @@ static int wrmsr_interception(struct vcpu_svm *svm) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) + if (svm_set_msr(&svm->vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); - else + } else { + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(&svm->vcpu); + } return 1; } @@ -2297,7 +2324,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR3] = emulate_on_interception, [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, - /* for now: */ + [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, [SVM_EXIT_WRITE_CR0] = emulate_on_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, @@ -2306,11 +2333,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_READ_DR4] = emulate_on_interception, + [SVM_EXIT_READ_DR5] = emulate_on_interception, + [SVM_EXIT_READ_DR6] = emulate_on_interception, + [SVM_EXIT_READ_DR7] = emulate_on_interception, [SVM_EXIT_WRITE_DR0] = emulate_on_interception, [SVM_EXIT_WRITE_DR1] = emulate_on_interception, [SVM_EXIT_WRITE_DR2] = emulate_on_interception, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR4] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR6] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, @@ -2383,20 +2416,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); - if (npt_enabled) { - int mmu_reload = 0; - if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { - svm_set_cr0(vcpu, svm->vmcb->save.cr0); - mmu_reload = 1; - } + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (mmu_reload) { - kvm_mmu_reset_context(vcpu); - kvm_mmu_load(vcpu); - } - } - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2798,12 +2821,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = root; force_new_asid(vcpu); - - if (vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - svm->vmcb->save.cr0 |= X86_CR0_TS; - vcpu->fpu_active = 0; - } } static int is_disabled(void) @@ -2852,6 +2869,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static void svm_cpuid_update(struct kvm_vcpu *vcpu) +{ +} + static const struct trace_print_flags svm_exit_reasons_str[] = { { SVM_EXIT_READ_CR0, "read_cr0" }, { SVM_EXIT_READ_CR3, "read_cr3" }, @@ -2905,9 +2926,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { { -1, NULL } }; -static bool svm_gb_page_enable(void) +static int svm_get_lpage_level(void) { - return true; + return PT_PDPE_LEVEL; +} + +static bool svm_rdtscp_supported(void) +{ + return false; +} + +static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + update_cr0_intercept(svm); + svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; } static struct kvm_x86_ops svm_x86_ops = { @@ -2936,6 +2970,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, @@ -2950,6 +2985,8 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, + .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, @@ -2975,7 +3012,11 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .exit_reasons_str = svm_exit_reasons_str, - .gb_page_enable = svm_gb_page_enable, + .get_lpage_level = svm_get_lpage_level, + + .cpuid_update = svm_cpuid_update, + + .rdtscp_supported = svm_rdtscp_supported, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 816e0449db0..6ad30a29f04 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall, ); /* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hv_hypercall, + TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, + __u64 ingpa, __u64 outgpa), + TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), + + TP_STRUCT__entry( + __field( __u16, code ) + __field( bool, fast ) + __field( __u16, rep_cnt ) + __field( __u16, rep_idx ) + __field( __u64, ingpa ) + __field( __u64, outgpa ) + ), + + TP_fast_assign( + __entry->code = code; + __entry->fast = fast; + __entry->rep_cnt = rep_cnt; + __entry->rep_idx = rep_idx; + __entry->ingpa = ingpa; + __entry->outgpa = outgpa; + ), + + TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", + __entry->code, __entry->fast ? "fast" : "slow", + __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, + __entry->outgpa) +); + +/* * Tracepoint for PIO. */ TRACE_EVENT(kvm_pio, @@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault, * Tracepoint for guest MSR access. */ TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), - TP_ARGS(rw, ecx, data), + TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), + TP_ARGS(write, ecx, data, exception), TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, ecx ) - __field( unsigned long, data ) + __field( unsigned, write ) + __field( u32, ecx ) + __field( u64, data ) + __field( u8, exception ) ), TP_fast_assign( - __entry->rw = rw; + __entry->write = write; __entry->ecx = ecx; __entry->data = data; + __entry->exception = exception; ), - TP_printk("msr_%s %x = 0x%lx", - __entry->rw ? "write" : "read", - __entry->ecx, __entry->data) + TP_printk("msr_%s %x = 0x%llx%s", + __entry->write ? "write" : "read", + __entry->ecx, __entry->data, + __entry->exception ? " (#GP)" : "") ); -#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) -#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) +#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) +#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) /* * Tracepoint for guest CR access. diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4918d6fc92..14873b9f843 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -61,6 +61,21 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE) +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT) + +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -136,6 +151,8 @@ struct vcpu_vmx { ktime_t entry_time; s64 vnmi_blocked_time; u32 exit_reason; + + bool rdtscp_enabled; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -210,7 +227,7 @@ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_K6_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -301,6 +318,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void) return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); } +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); +} + static inline int cpu_has_vmx_invept_individual_addr(void) { return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); @@ -336,9 +358,7 @@ static inline int cpu_has_vmx_ple(void) static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { - return flexpriority_enabled && - (cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm)); + return flexpriority_enabled && irqchip_in_kernel(kvm); } static inline int cpu_has_vmx_vpid(void) @@ -347,6 +367,12 @@ static inline int cpu_has_vmx_vpid(void) SECONDARY_EXEC_ENABLE_VPID; } +static inline int cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDTSCP; +} + static inline int cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -551,22 +577,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); - if (!vcpu->fpu_active) - eb |= 1u << NM_VECTOR; - /* - * Unconditionally intercept #DB so we can maintain dr6 without - * reading it every exit. - */ - eb |= 1u << DB_VECTOR; - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - eb |= 1u << BP_VECTOR; - } + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | + (1u << NM_VECTOR) | (1u << DB_VECTOR); + if ((vcpu->guest_debug & + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + if (vcpu->fpu_active) + eb &= ~(1u << NM_VECTOR); vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -589,7 +611,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer; u64 ignore_bits; - guest_efer = vmx->vcpu.arch.shadow_efer; + guest_efer = vmx->vcpu.arch.efer; /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless @@ -767,22 +789,30 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static void vmx_fpu_activate(struct kvm_vcpu *vcpu) { + ulong cr0; + if (vcpu->fpu_active) return; vcpu->fpu_active = 1; - vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->arch.cr0 & X86_CR0_TS) - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + cr0 = vmcs_readl(GUEST_CR0); + cr0 &= ~(X86_CR0_TS | X86_CR0_MP); + cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); + vmcs_writel(GUEST_CR0, cr0); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); + static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active) - return; - vcpu->fpu_active = 0; - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + vmx_decache_cr0_guest_bits(vcpu); + vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = 0; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -878,6 +908,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } +static bool vmx_rdtscp_supported(void) +{ + return cpu_has_vmx_rdtscp(); +} + /* * Swap MSR entry in host/guest MSR entry array. */ @@ -913,12 +948,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && vmx->rdtscp_enabled) + move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ index = __find_msr_index(vmx, MSR_K6_STAR); - if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) + if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } #endif @@ -1002,6 +1040,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_TSC_AUX: + if (!to_vmx(vcpu)->rdtscp_enabled) + return 1; + /* Otherwise falls through */ default: vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); @@ -1065,7 +1107,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - /* Otherwise falls through to kvm_set_msr_common */ + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; + case MSR_TSC_AUX: + if (!vmx->rdtscp_enabled) + return 1; + /* Check reserved bit, higher 32 bits should be zero */ + if ((data >> 32) != 0) + return 1; + /* Otherwise falls through */ default: msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -1224,6 +1274,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | @@ -1243,7 +1295,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING; + SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_RDTSCP; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1457,8 +1510,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu) static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + - kvm->memslots[0].npages - 3; + struct kvm_memslots *slots; + gfn_t base_gfn; + + slots = rcu_dereference(kvm->memslots); + base_gfn = kvm->memslots->memslots[0].base_gfn + + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } return kvm->arch.tss_addr; @@ -1544,9 +1601,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) * of this msr depends on is_long_mode(). */ vmx_load_host_state(to_vmx(vcpu)); - vcpu->arch.shadow_efer = efer; - if (!msr) - return; + vcpu->arch.efer = efer; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1576,13 +1631,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu) (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; - vmx_set_efer(vcpu, vcpu->arch.shadow_efer); + vcpu->arch.efer |= EFER_LMA; + vmx_set_efer(vcpu, vcpu->arch.efer); } static void exit_lmode(struct kvm_vcpu *vcpu) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) @@ -1598,10 +1653,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -1646,7 +1711,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1654,23 +1719,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } if (!(cr0 & X86_CR0_WP)) *hw_cr0 &= ~X86_CR0_WP; } -static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, - struct kvm_vcpu *vcpu) -{ - if (!is_paging(vcpu)) { - *hw_cr4 &= ~X86_CR4_PAE; - *hw_cr4 |= X86_CR4_PSE; - } else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) - *hw_cr4 &= ~X86_CR4_PAE; -} - static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1682,8 +1737,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) else hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; - vmx_fpu_deactivate(vcpu); - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -1691,7 +1744,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) @@ -1702,12 +1755,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if (!vcpu->fpu_active) + hw_cr0 |= X86_CR0_TS | X86_CR0_MP; + vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - - if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) - vmx_fpu_activate(vcpu); } static u64 construct_eptp(unsigned long root_hpa) @@ -1738,8 +1791,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, guest_cr3); - if (vcpu->arch.cr0 & X86_CR0_PE) - vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1748,8 +1799,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; - if (enable_ept) - ept_update_paging_mode_cr4(&hw_cr4, vcpu); + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } + } vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); @@ -1787,7 +1844,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ + if (!is_protmode(vcpu)) return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ @@ -2042,7 +2099,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) static bool guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ - if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!is_protmode(vcpu)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) @@ -2175,7 +2232,7 @@ static int alloc_apic_access_page(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -2188,7 +2245,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2197,7 +2254,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.ept_identity_pagetable) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; @@ -2212,7 +2269,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2384,14 +2441,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; - u64 data; int j = vmx->nmsrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; - data = data_low | ((u64)data_high << 32); vmx->guest_msrs[j].index = i; vmx->guest_msrs[j].data = 0; vmx->guest_msrs[j].mask = -1ull; @@ -2404,7 +2459,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); @@ -2429,10 +2487,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret; + int ret, idx; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2526,7 +2584,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ + vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); vmx_fpu_activate(&vmx->vcpu); @@ -2540,7 +2598,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } @@ -2717,6 +2775,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, kvm_queue_exception(vcpu, vec); return 1; case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return 0; /* fall through */ @@ -2839,6 +2903,13 @@ static int handle_exception(struct kvm_vcpu *vcpu) kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); /* fall through */ case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; @@ -2940,11 +3011,10 @@ static int handle_cr(struct kvm_vcpu *vcpu) }; break; case 2: /* clts */ - vmx_fpu_deactivate(vcpu); - vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); - vmx_fpu_activate(vcpu); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); + vmx_fpu_activate(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { @@ -2962,7 +3032,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 3: /* lmsw */ - kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); return 1; @@ -2975,12 +3047,22 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 0; } +static int check_dr_alias(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return -1; + } + return 0; +} + static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; unsigned long val; int dr, reg; + /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; dr = vmcs_readl(GUEST_DR7); @@ -3016,14 +3098,20 @@ static int handle_dr(struct kvm_vcpu *vcpu) case 0 ... 3: val = vcpu->arch.db[dr]; break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ case 6: val = vcpu->arch.dr6; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ val = vcpu->arch.dr7; break; - default: - val = 0; } kvm_register_write(vcpu, reg, val); } else { @@ -3034,21 +3122,25 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - kvm_queue_exception(vcpu, UD_VECTOR); - break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ case 6: if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { @@ -3075,6 +3167,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) u64 data; if (vmx_get_msr(vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } @@ -3094,13 +3187,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; } + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(vcpu); return 1; } @@ -3385,7 +3478,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) } if (err != EMULATE_DONE) { - kvm_report_emulation_failure(vcpu, "emulation failure"); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -3416,6 +3508,12 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -3453,6 +3551,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, }; static const int kvm_vmx_max_exit_handlers = @@ -3686,9 +3786,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ vmcs_writel(HOST_CR0, read_cr0()); - if (vcpu->arch.switch_db_regs) - set_debugreg(vcpu->arch.dr6, 6); - asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -3789,9 +3886,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; - if (vcpu->arch.switch_db_regs) - get_debugreg(vcpu->arch.dr6, 6); - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); @@ -3920,7 +4014,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * b. VT-d with snooping control feature: snooping control feature of * VT-d engine can guarantee the cache correctness. Just set it * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep + * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ if (is_mmio) @@ -3931,37 +4025,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) VMX_EPT_MT_EPTE_SHIFT; else ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) - | VMX_EPT_IGMT_BIT; + | VMX_EPT_IPAT_BIT; return ret; } +#define _ER(x) { EXIT_REASON_##x, #x } + static const struct trace_print_flags vmx_exit_reasons_str[] = { - { EXIT_REASON_EXCEPTION_NMI, "exception" }, - { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, - { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, - { EXIT_REASON_NMI_WINDOW, "nmi_window" }, - { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, - { EXIT_REASON_CR_ACCESS, "cr_access" }, - { EXIT_REASON_DR_ACCESS, "dr_access" }, - { EXIT_REASON_CPUID, "cpuid" }, - { EXIT_REASON_MSR_READ, "rdmsr" }, - { EXIT_REASON_MSR_WRITE, "wrmsr" }, - { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, - { EXIT_REASON_HLT, "halt" }, - { EXIT_REASON_INVLPG, "invlpg" }, - { EXIT_REASON_VMCALL, "hypercall" }, - { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, - { EXIT_REASON_APIC_ACCESS, "apic_access" }, - { EXIT_REASON_WBINVD, "wbinvd" }, - { EXIT_REASON_TASK_SWITCH, "task_switch" }, - { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + _ER(EXCEPTION_NMI), + _ER(EXTERNAL_INTERRUPT), + _ER(TRIPLE_FAULT), + _ER(PENDING_INTERRUPT), + _ER(NMI_WINDOW), + _ER(TASK_SWITCH), + _ER(CPUID), + _ER(HLT), + _ER(INVLPG), + _ER(RDPMC), + _ER(RDTSC), + _ER(VMCALL), + _ER(VMCLEAR), + _ER(VMLAUNCH), + _ER(VMPTRLD), + _ER(VMPTRST), + _ER(VMREAD), + _ER(VMRESUME), + _ER(VMWRITE), + _ER(VMOFF), + _ER(VMON), + _ER(CR_ACCESS), + _ER(DR_ACCESS), + _ER(IO_INSTRUCTION), + _ER(MSR_READ), + _ER(MSR_WRITE), + _ER(MWAIT_INSTRUCTION), + _ER(MONITOR_INSTRUCTION), + _ER(PAUSE_INSTRUCTION), + _ER(MCE_DURING_VMENTRY), + _ER(TPR_BELOW_THRESHOLD), + _ER(APIC_ACCESS), + _ER(EPT_VIOLATION), + _ER(EPT_MISCONFIG), + _ER(WBINVD), { -1, NULL } }; -static bool vmx_gb_page_enable(void) +#undef _ER + +static int vmx_get_lpage_level(void) +{ + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; +} + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + +static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { - return false; + struct kvm_cpuid_entry2 *best; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control; + + vmx->rdtscp_enabled = false; + if (vmx_rdtscp_supported()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + if (exec_control & SECONDARY_EXEC_RDTSCP) { + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } + } + } } static struct kvm_x86_ops vmx_x86_ops = { @@ -3990,6 +4135,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, @@ -4002,6 +4148,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, + .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, @@ -4027,7 +4175,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .exit_reasons_str = vmx_exit_reasons_str, - .gb_page_enable = vmx_gb_page_enable, + .get_lpage_level = vmx_get_lpage_level, + + .cpuid_update = vmx_cpuid_update, + + .rdtscp_supported = vmx_rdtscp_supported, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1e1bc9d412..e46282a5656 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -38,6 +38,7 @@ #include <linux/intel-iommu.h> #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> +#include <linux/srcu.h> #include <trace/events/kvm.h> #undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS @@ -93,16 +94,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); struct kvm_shared_msrs_global { int nr; - struct kvm_shared_msr { - u32 msr; - u64 value; - } msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_NR_SHARED_MSRS]; }; struct kvm_shared_msrs { struct user_return_notifier urn; bool registered; - u64 current_value[KVM_NR_SHARED_MSRS]; + struct kvm_shared_msr_values { + u64 host; + u64 curr; + } values[KVM_NR_SHARED_MSRS]; }; static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; @@ -147,53 +148,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msr *global; struct kvm_shared_msrs *locals = container_of(urn, struct kvm_shared_msrs, urn); + struct kvm_shared_msr_values *values; for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - global = &shared_msrs_global.msrs[slot]; - if (global->value != locals->current_value[slot]) { - wrmsrl(global->msr, global->value); - locals->current_value[slot] = global->value; + values = &locals->values[slot]; + if (values->host != values->curr) { + wrmsrl(shared_msrs_global.msrs[slot], values->host); + values->curr = values->host; } } locals->registered = false; user_return_notifier_unregister(urn); } -void kvm_define_shared_msr(unsigned slot, u32 msr) +static void shared_msr_update(unsigned slot, u32 msr) { - int cpu; + struct kvm_shared_msrs *smsr; u64 value; + smsr = &__get_cpu_var(shared_msrs); + /* only read, and nobody should modify it at this time, + * so don't need lock */ + if (slot >= shared_msrs_global.nr) { + printk(KERN_ERR "kvm: invalid MSR slot!"); + return; + } + rdmsrl_safe(msr, &value); + smsr->values[slot].host = value; + smsr->values[slot].curr = value; +} + +void kvm_define_shared_msr(unsigned slot, u32 msr) +{ if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot].msr = msr; - rdmsrl_safe(msr, &value); - shared_msrs_global.msrs[slot].value = value; - for_each_online_cpu(cpu) - per_cpu(shared_msrs, cpu).current_value[slot] = value; + shared_msrs_global.msrs[slot] = msr; + /* we need ensured the shared_msr_global have been updated */ + smp_wmb(); } EXPORT_SYMBOL_GPL(kvm_define_shared_msr); static void kvm_shared_msr_cpu_online(void) { unsigned i; - struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs); for (i = 0; i < shared_msrs_global.nr; ++i) - locals->current_value[i] = shared_msrs_global.msrs[i].value; + shared_msr_update(i, shared_msrs_global.msrs[i]); } void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); - if (((value ^ smsr->current_value[slot]) & mask) == 0) + if (((value ^ smsr->values[slot].curr) & mask) == 0) return; - smsr->current_value[slot] = value; - wrmsrl(shared_msrs_global.msrs[slot].msr, value); + smsr->values[slot].curr = value; + wrmsrl(shared_msrs_global.msrs[slot], value); if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -257,12 +269,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 + +static int exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return EXCPT_PF; + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return EXCPT_CONTRIBUTORY; + default: + break; + } + return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, bool has_error, u32 error_code) +{ + u32 prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { + queue: + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } + + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) + || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + goto queue; +} + void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.nr = nr; + kvm_multiple_exception(vcpu, nr, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); @@ -270,25 +338,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; - - if (vcpu->arch.exception.pending) { - switch(vcpu->arch.exception.nr) { - case DF_VECTOR: - /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); - return; - case PF_VECTOR: - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - return; - default: - /* replace previous exception with a new one in a hope - that instruction re-execution will regenerate lost - exception */ - vcpu->arch.exception.pending = false; - break; - } - } vcpu->arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } @@ -301,11 +350,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; + kvm_multiple_exception(vcpu, nr, true, error_code); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); @@ -383,12 +428,18 @@ out: void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - if (cr0 & CR0_RESERVED_BITS) { + cr0 |= X86_CR0_ET; + +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); + cr0, kvm_read_cr0(vcpu)); kvm_inject_gp(vcpu, 0); return; } +#endif + + cr0 &= ~CR0_RESERVED_BITS; if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); @@ -405,7 +456,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.shadow_efer & EFER_LME)) { + if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) { @@ -443,13 +494,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = vcpu->arch.cr4; + unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; if (cr4 & CR4_RESERVED_BITS) { @@ -575,9 +626,11 @@ static inline u32 bit(int bitno) * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 2 +#define KVM_SAVE_MSRS_BEGIN 5 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 @@ -602,7 +655,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) } if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); kvm_inject_gp(vcpu, 0); return; @@ -633,9 +686,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); efer &= ~EFER_LMA; - efer |= vcpu->arch.shadow_efer & EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); @@ -957,6 +1010,100 @@ out: return r; } +static bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = true; + break; + } + + return r; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copy_to_user((void __user *)addr, instructions, 4)) + return 1; + kvm->arch.hv_hypercall = data; + break; + } + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + return 0; +} + +static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + addr = gfn_to_hva(vcpu->kvm, data >> + HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + if (kvm_is_error_hva(addr)) + return 1; + if (clear_user((void __user *)addr, PAGE_SIZE)) + return 1; + vcpu->arch.hv_vapic = data; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + + return 0; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1071,6 +1218,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return set_msr_hyperv(vcpu, msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1170,6 +1327,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -1221,7 +1426,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: - data = vcpu->arch.shadow_efer; + data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: data = vcpu->kvm->arch.wall_clock; @@ -1236,6 +1441,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return get_msr_hyperv(vcpu, msr, pdata); + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -1261,15 +1476,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { - int i; + int i, idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu_put(vcpu); @@ -1351,6 +1566,11 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XEN_HVM: case KVM_CAP_ADJUST_CLOCK: case KVM_CAP_VCPU_EVENTS: + case KVM_CAP_HYPERV: + case KVM_CAP_HYPERV_VAPIC: + case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_PCI_SEGMENT: + case KVM_CAP_X86_ROBUST_SINGLESTEP: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1464,8 +1684,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + kvm_x86_ops->vcpu_put(vcpu); } static int is_efer_nx(void) @@ -1530,6 +1750,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, cpuid_fix_nx_cap(vcpu); r = 0; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); out_free: vfree(cpuid_entries); @@ -1552,6 +1773,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, goto out; vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); return 0; out: @@ -1594,12 +1816,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; - unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; unsigned f_lm = F(LM); #else + unsigned f_gbpages = 0; unsigned f_lm = 0; #endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -1619,7 +1844,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = @@ -1866,7 +2091,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !(vcpu->arch.cr4 & X86_CR4_MCE)) { + !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); @@ -2160,14 +2385,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; spin_unlock(&kvm->mmu_lock); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } @@ -2176,13 +2401,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) return kvm->arch.n_alloc_mmu_pages; } +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; - for (i = 0; i < kvm->arch.naliases; ++i) { - alias = &kvm->arch.aliases[i]; + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; if (gfn >= alias->base_gfn && gfn < alias->base_gfn + alias->npages) return alias->target_gfn + gfn - alias->base_gfn; @@ -2200,6 +2447,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, { int r, n; struct kvm_mem_alias *p; + struct kvm_mem_aliases *aliases, *old_aliases; r = -EINVAL; /* General sanity checks */ @@ -2216,26 +2464,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, < alias->target_phys_addr) goto out; - down_write(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out; + + mutex_lock(&kvm->slots_lock); - p = &kvm->arch.aliases[alias->slot]; + /* invalidate any gfn reference in case of deletion/shrinking */ + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kvm_mmu_zap_all(kvm); + kfree(old_aliases); + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out_unlock; + + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + + p = &aliases->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + p->flags &= ~(KVM_ALIAS_INVALID); for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->arch.aliases[n - 1].npages) + if (aliases->aliases[n - 1].npages) break; - kvm->arch.naliases = n; + aliases->naliases = n; - spin_unlock(&kvm->mmu_lock); - kvm_mmu_zap_all(kvm); - - up_write(&kvm->slots_lock); - - return 0; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kfree(old_aliases); + r = 0; +out_unlock: + mutex_unlock(&kvm->slots_lock); out: return r; } @@ -2273,18 +2543,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -2364,29 +2634,62 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - int n; + int r, n, i; struct kvm_memory_slot *memslot; - int is_dirty = 0; + unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) goto out; + memset(dirty_bitmap, 0, n); + + for (i = 0; !is_dirty && i < n/sizeof(long); i++) + is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - memset(memslot->dirty_bitmap, 0, n); + + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + + old_slots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kfree(old_slots); } + r = 0; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) + r = -EFAULT; +out_free: + vfree(dirty_bitmap); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2469,6 +2772,8 @@ long kvm_arch_vm_ioctl(struct file *filp, if (vpic) { r = kvm_ioapic_init(kvm); if (r) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev); kfree(vpic); goto create_irqchip_unlock; } @@ -2480,10 +2785,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->irq_lock); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm->arch.vpic = NULL; - kvm->arch.vioapic = NULL; + kvm_ioapic_destroy(kvm); + kvm_destroy_pic(kvm); mutex_unlock(&kvm->irq_lock); } create_irqchip_unlock: @@ -2499,7 +2802,7 @@ long kvm_arch_vm_ioctl(struct file *filp, sizeof(struct kvm_pit_config))) goto out; create_pit: - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -2508,7 +2811,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpit) r = 0; create_pit_unlock: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); break; case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { @@ -2725,7 +3028,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) @@ -2734,17 +3037,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_FETCH_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +/* uses this to access any guest's mapped memory without checking CPL */ +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); +} + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 access, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2767,14 +3097,37 @@ out: return r; } +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error); +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + error); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} + static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2804,6 +3157,7 @@ static int emulator_read_emulated(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -2813,17 +3167,20 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; mmio: /* @@ -2862,11 +3219,12 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; } @@ -2930,7 +3288,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, char *kaddr; u64 val; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); if (gpa == UNMAPPED_GVA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -2967,35 +3325,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + kvm_x86_ops->fpu_activate(vcpu); return X86EMUL_CONTINUE; } int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - struct kvm_vcpu *vcpu = ctxt->vcpu; - - switch (dr) { - case 0 ... 3: - *dest = kvm_x86_ops->get_dr(vcpu, dr); - return X86EMUL_CONTINUE; - default: - pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); - return X86EMUL_UNHANDLEABLE; - } + return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); } int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - int exception; - kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); - if (exception) { - /* FIXME: better handling */ - return X86EMUL_UNHANDLEABLE; - } - return X86EMUL_CONTINUE; + return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) @@ -3009,7 +3353,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -3017,7 +3361,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt, + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, @@ -3060,8 +3405,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.vcpu = vcpu; vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; @@ -3153,12 +3499,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu) gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; int ret; + u32 error_code; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu); + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); else - ret = kvm_read_guest_virt(q, p, bytes, vcpu); + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + return ret; } @@ -3179,7 +3530,7 @@ int complete_pio(struct kvm_vcpu *vcpu) if (io->in) { r = pio_copy_data(vcpu); if (r) - return r; + goto out; } delta = 1; @@ -3206,7 +3557,7 @@ int complete_pio(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RSI, val); } } - +out: io->count -= io->cur_count; io->cur_count = 0; @@ -3219,11 +3570,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) int r; if (vcpu->arch.pio.in) - r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); else - r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); return r; } @@ -3234,7 +3586,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu) int i, r = 0; for (i = 0; i < io->cur_count; i++) { - if (kvm_io_bus_write(&vcpu->kvm->pio_bus, + if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, io->port, io->size, pd)) { r = -EOPNOTSUPP; break; @@ -3248,6 +3600,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) { unsigned long val; + trace_kvm_pio(!in, port, size, 1); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3259,11 +3613,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, 1); - - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(vcpu->arch.pio_data, &val, 4); + if (!vcpu->arch.pio.in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); + } if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { complete_pio(vcpu); @@ -3280,6 +3633,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, unsigned now, in_page; int ret = 0; + trace_kvm_pio(!in, port, size, count); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3291,9 +3646,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, count); - if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); return 1; @@ -3325,10 +3677,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_gp(vcpu, 0); + if (ret == X86EMUL_PROPAGATE_FAULT) return 1; - } if (ret == 0 && !pio_string_write(vcpu)) { complete_pio(vcpu); if (vcpu->arch.pio.count == 0) @@ -3487,11 +3837,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, return a0 | ((gpa_t)a1 << 32); } +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; int r = 1; + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -3534,10 +3949,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); int kvm_fix_hypercall(struct kvm_vcpu *vcpu) { char instruction[3]; - int ret = 0; unsigned long rip = kvm_rip_read(vcpu); - /* * Blow out the MMU to ensure that no other VCPU has an active mapping * to ensure that the updated hypercall appears atomically across all @@ -3546,11 +3959,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_mmu_zap_all(vcpu->kvm); kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(rip, instruction, 3, vcpu) - != X86EMUL_CONTINUE) - ret = -EFAULT; - return ret; + return emulator_write_emulated(rip, instruction, 3, vcpu); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -3583,10 +3993,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) { unsigned long value; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); switch (cr) { case 0: - value = vcpu->arch.cr0; + value = kvm_read_cr0(vcpu); break; case 2: value = vcpu->arch.cr2; @@ -3595,7 +4004,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) value = vcpu->arch.cr3; break; case 4: - value = vcpu->arch.cr4; + value = kvm_read_cr4(vcpu); break; case 8: value = kvm_get_cr8(vcpu); @@ -3613,7 +4022,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, { switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); + kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); *rflags = kvm_get_rflags(vcpu); break; case 2: @@ -3623,7 +4032,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); + kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: kvm_set_cr8(vcpu, val & 0xfUL); @@ -3690,6 +4099,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, } return best; } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { @@ -3773,14 +4183,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu) static void vapic_exit(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + int idx; if (!apic || !apic->vapic_addr) return; - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_release_page_dirty(apic->vapic_page); mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -3876,12 +4287,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } } preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); local_irq_disable(); @@ -3909,7 +4325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_to_vapic(vcpu); } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -3951,7 +4367,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); /* * Profile KVM exit RIPs: @@ -3973,6 +4389,7 @@ out: static int __vcpu_run(struct kvm_vcpu *vcpu) { int r; + struct kvm *kvm = vcpu->kvm; if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { pr_debug("vcpu %d received sipi with vector # %x\n", @@ -3984,7 +4401,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vapic_enter(vcpu); r = 1; @@ -3992,9 +4409,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) r = vcpu_enter_guest(vcpu); else { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) { switch(vcpu->arch.mp_state) { @@ -4029,13 +4446,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) ++vcpu->stat.signal_exits; } if (need_resched()) { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); post_kvm_run_save(vcpu); vapic_exit(vcpu); @@ -4074,10 +4491,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, EMULTYPE_NO_DECODE); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r == EMULATE_DO_MMIO) { /* * Read-modify-write. Back to userspace. @@ -4204,13 +4621,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - sregs->cr0 = vcpu->arch.cr0; + sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = vcpu->arch.cr4; + sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); - sregs->efer = vcpu->arch.shadow_efer; + sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); @@ -4298,14 +4714,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, { struct descriptor_table dtable; u16 index = selector >> 3; + int ret; + u32 err; + gva_t addr; get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) { kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); - return 1; + return X86EMUL_PROPAGATE_FAULT; } - return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), + vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); + + return ret; } /* allowed just for 8 bytes segments */ @@ -4319,15 +4744,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); +} + +static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc) +{ + u32 base_addr = get_desc_base(seg_desc); + + return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL); } -static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, +static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) { u32 base_addr = get_desc_base(seg_desc); - return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); + return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL); } static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) @@ -4338,18 +4771,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) return kvm_seg.selector; } -static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, - u16 selector, - struct kvm_segment *kvm_seg) -{ - struct desc_struct seg_desc; - - if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) - return 1; - seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); - return 0; -} - static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment segvar = { @@ -4367,7 +4788,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se .unusable = 0, }; kvm_x86_ops->set_segment(vcpu, &segvar, seg); - return 0; + return X86EMUL_CONTINUE; } static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) @@ -4377,24 +4798,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg) +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment kvm_seg; + struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; - if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) return kvm_load_realmode_segment(vcpu, selector, seg); - if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) - return 1; - kvm_seg.type |= type_bits; - if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && - seg != VCPU_SREG_LDTR) - if (!kvm_seg.s) - kvm_seg.unusable = 1; + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + if (ret) + return ret; + + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; + } + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } +load: kvm_set_segment(vcpu, &kvm_seg, seg); - return 0; + return X86EMUL_CONTINUE; +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); + return X86EMUL_PROPAGATE_FAULT; } static void save_state_to_tss32(struct kvm_vcpu *vcpu, @@ -4420,6 +4929,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); } +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg) +{ + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + static int load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { @@ -4437,25 +4954,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) + if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) + if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) return 1; return 0; } @@ -4495,19 +5028,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; return 0; } @@ -4529,7 +5076,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_16)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_16, sizeof tss_segment_16)) goto out; @@ -4537,7 +5084,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_16.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_16.prev_task_link, sizeof tss_segment_16.prev_task_link)) goto out; @@ -4568,7 +5115,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_32)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_32, sizeof tss_segment_32)) goto out; @@ -4576,7 +5123,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_32.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_32.prev_task_link, sizeof tss_segment_32.prev_task_link)) goto out; @@ -4599,7 +5146,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); - old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); + old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); /* FIXME: Handle errors. Failure to read either TSS or their * descriptors should generate a pagefault. @@ -4658,7 +5205,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) &nseg_desc); } - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); tr_seg.type = 11; kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); @@ -4689,17 +5236,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; + mmu_reset_needed |= vcpu->arch.efer != sregs->efer; kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; - mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.cr3); @@ -4734,7 +5279,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Older userspace won't unhalt the vcpu on reset. */ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !(vcpu->arch.cr0 & X86_CR0_PE)) + !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; vcpu_put(vcpu); @@ -4832,11 +5377,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { unsigned long vaddr = tr->linear_address; gpa_t gpa; + int idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - up_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; @@ -4917,14 +5463,14 @@ EXPORT_SYMBOL_GPL(fx_init); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + if (vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 1; kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_restore(&vcpu->arch.guest_fx_image); + trace_kvm_fpu(1); } -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { @@ -4935,8 +5481,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) kvm_fx_save(&vcpu->arch.guest_fx_image); kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; + set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + trace_kvm_fpu(0); } -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { @@ -5088,11 +5635,13 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + int idx; + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); } @@ -5103,6 +5652,12 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); + kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!kvm->arch.aliases) { + kfree(kvm); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5159,16 +5714,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + cleanup_srcu_struct(&kvm->srcu); + kfree(kvm->arch.aliases); kfree(kvm); } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + int npages = memslot->npages; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5188,26 +5745,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (IS_ERR((void *)userspace_addr)) return PTR_ERR((void *)userspace_addr); - /* set userspace_addr atomically for kvm_hva_to_rmapp */ - spin_lock(&kvm->mmu_lock); memslot->userspace_addr = userspace_addr; - spin_unlock(&kvm->mmu_lock); - } else { - if (!old.user_alloc && old.rmap) { - int ret; - - down_write(¤t->mm->mmap_sem); - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } } } + + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + + int npages = mem->memory_size >> PAGE_SHIFT; + + if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + int ret; + + down_write(¤t->mm->mmap_sem); + ret = do_munmap(current->mm, old.userspace_addr, + old.npages * PAGE_SIZE); + up_write(¤t->mm->mmap_sem); + if (ret < 0) + printk(KERN_WARNING + "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages) { unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); @@ -5216,8 +5782,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5eadea585d2..2d101639bd8 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_X86_H #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { @@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr) struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); +static inline bool is_protmode(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PE); +} + +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->arch.efer & EFER_LMA; +#else + return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PG); +} + #endif diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index f69b7783027..11fc4d5c43e 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -969,7 +969,7 @@ int pohmelfs_setattr_raw(struct inode *inode, struct iattr *attr) if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - err = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + err = dquot_transfer(inode, attr); if (err) goto err_out_exit; } diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 14d94420457..08b2eb15704 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -151,7 +151,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) if (access == V9FS_ACCESS_SINGLE) return ERR_PTR(-EPERM); - if (v9fs_extended(v9ses)) + if (v9fs_proto_dotu(v9ses)) uname = NULL; else uname = v9ses->uname; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 7d6c2139891..6c7f6a25111 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -241,7 +241,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); - v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER; + v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER; strcpy(v9ses->uname, V9FS_DEFUSER); strcpy(v9ses->aname, V9FS_DEFANAME); v9ses->uid = ~0; @@ -262,13 +262,13 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, goto error; } - if (!v9ses->clnt->dotu) - v9ses->flags &= ~V9FS_EXTENDED; + if (!p9_is_proto_dotu(v9ses->clnt)) + v9ses->flags &= ~V9FS_PROTO_2000U; v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; /* for legacy mode, fall back to V9FS_ACCESS_ANY */ - if (!v9fs_extended(v9ses) && + if (!v9fs_proto_dotu(v9ses) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { v9ses->flags &= ~V9FS_ACCESS_MASK; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 019f4ccb70c..79000bf6249 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -23,7 +23,8 @@ /** * enum p9_session_flags - option flags for each 9P session - * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions + * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions + * @V9FS_PROTO_2010L: whether or not to use 9P2010.l extensions * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) * @V9FS_ACCESS_ANY: use a single attach for all users @@ -32,11 +33,12 @@ * Session flags reflect options selected by users at mount time */ enum p9_session_flags { - V9FS_EXTENDED = 0x01, - V9FS_ACCESS_SINGLE = 0x02, - V9FS_ACCESS_USER = 0x04, - V9FS_ACCESS_ANY = 0x06, - V9FS_ACCESS_MASK = 0x06, + V9FS_PROTO_2000U = 0x01, + V9FS_PROTO_2010L = 0x02, + V9FS_ACCESS_SINGLE = 0x04, + V9FS_ACCESS_USER = 0x08, + V9FS_ACCESS_ANY = 0x0C, + V9FS_ACCESS_MASK = 0x0C, }; /* possible values of ->cache */ @@ -121,7 +123,12 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) return (inode->i_sb->s_fs_info); } -static inline int v9fs_extended(struct v9fs_session_info *v9ses) +static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) { - return v9ses->flags & V9FS_EXTENDED; + return v9ses->flags & V9FS_PROTO_2000U; +} + +static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) +{ + return v9ses->flags & V9FS_PROTO_2010L; } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 15cce53bf61..6580aa44954 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -135,7 +135,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (rdir->head < rdir->tail) { err = p9stat_read(rdir->buf + rdir->head, buflen - rdir->head, &st, - fid->clnt->dotu); + fid->clnt->proto_version); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 74a0461a9ac..36122683fae 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); v9ses = v9fs_inode2v9ses(inode); - omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses)); + omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { fid = v9fs_fid_clone(file->f_path.dentry); @@ -77,7 +77,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) i_size_write(inode, 0); inode->i_blocks = 0; } - if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) + if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses))) generic_file_llseek(file, 0, SEEK_END); } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index a407fa3388c..5fe45d692c9 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -60,7 +60,7 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) res = mode & 0777; if (S_ISDIR(mode)) res |= P9_DMDIR; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { if (S_ISLNK(mode)) res |= P9_DMSYMLINK; if (v9ses->nodev == 0) { @@ -102,21 +102,21 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; - else if ((mode & P9_DMSYMLINK) && (v9fs_extended(v9ses))) + else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses))) res |= S_IFLNK; - else if ((mode & P9_DMSOCKET) && (v9fs_extended(v9ses)) + else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses)) && (v9ses->nodev == 0)) res |= S_IFSOCK; - else if ((mode & P9_DMNAMEDPIPE) && (v9fs_extended(v9ses)) + else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses)) && (v9ses->nodev == 0)) res |= S_IFIFO; - else if ((mode & P9_DMDEVICE) && (v9fs_extended(v9ses)) + else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses)) && (v9ses->nodev == 0)) res |= S_IFBLK; else res |= S_IFREG; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { if ((mode & P9_DMSETUID) == P9_DMSETUID) res |= S_ISUID; @@ -265,7 +265,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) case S_IFBLK: case S_IFCHR: case S_IFSOCK: - if (!v9fs_extended(v9ses)) { + if (!v9fs_proto_dotu(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); err = -EINVAL; @@ -278,7 +278,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) inode->i_fop = &v9fs_file_operations; break; case S_IFLNK: - if (!v9fs_extended(v9ses)) { + if (!v9fs_proto_dotu(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used w/o 9P2000.u\n"); err = -EINVAL; @@ -288,7 +288,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) break; case S_IFDIR: inc_nlink(inode); - if (v9fs_extended(v9ses)) + if (v9fs_proto_dotu(v9ses)) inode->i_op = &v9fs_dir_inode_operations_ext; else inode->i_op = &v9fs_dir_inode_operations; @@ -575,7 +575,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, flags = O_RDWR; fid = v9fs_create(v9ses, dir, dentry, NULL, perm, - v9fs_uflags2omode(flags, v9fs_extended(v9ses))); + v9fs_uflags2omode(flags, + v9fs_proto_dotu(v9ses))); if (IS_ERR(fid)) { err = PTR_ERR(fid); fid = NULL; @@ -858,7 +859,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) if (iattr->ia_valid & ATTR_SIZE) wstat.length = iattr->ia_size; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { if (iattr->ia_valid & ATTR_UID) wstat.n_uid = iattr->ia_uid; @@ -886,6 +887,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { char ext[32]; + char tag_name[14]; + unsigned int i_nlink; struct v9fs_session_info *v9ses = sb->s_fs_info; inode->i_nlink = 1; @@ -897,11 +900,26 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_uid = v9ses->dfltuid; inode->i_gid = v9ses->dfltgid; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { inode->i_uid = stat->n_uid; inode->i_gid = stat->n_gid; } - + if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) { + if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) { + /* + * Hadlink support got added later to + * to the .u extension. So there can be + * server out there that doesn't support + * this even with .u extension. So check + * for non NULL stat->extension + */ + strncpy(ext, stat->extension, sizeof(ext)); + /* HARDLINKCOUNT %u */ + sscanf(ext, "%13s %u", tag_name, &i_nlink); + if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) + inode->i_nlink = i_nlink; + } + } inode->i_mode = p9mode2unixmode(v9ses, stat->mode); if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { char type = 0; @@ -976,7 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) if (IS_ERR(fid)) return PTR_ERR(fid); - if (!v9fs_extended(v9ses)) + if (!v9fs_proto_dotu(v9ses)) return -EBADF; st = p9_client_stat(fid); @@ -1066,7 +1084,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, struct p9_fid *fid; v9ses = v9fs_inode2v9ses(dir); - if (!v9fs_extended(v9ses)) { + if (!v9fs_proto_dotu(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); return -EPERM; } diff --git a/fs/attr.c b/fs/attr.c index 96d394bdadd..0a6ea54cde7 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -12,7 +12,6 @@ #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> -#include <linux/quotaops.h> #include <linux/security.h> /* Taken over from the old code... */ @@ -212,14 +211,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr) error = inode->i_op->setattr(dentry, attr); } else { error = inode_change_ok(inode, attr); - if (!error) { - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) - error = vfs_dq_transfer(inode, attr) ? - -EDQUOT : 0; - if (!error) - error = inode_setattr(inode, attr); - } + if (!error) + error = inode_setattr(inode, attr); } if (ia_valid & ATTR_SIZE) diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 7f8d2e5a7ea..1d081f0cfec 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -570,7 +570,7 @@ do_more: error_return: brelse(bitmap_bh); release_blocks(sb, freed); - vfs_dq_free_block(inode, freed); + dquot_free_block(inode, freed); } /** @@ -1236,6 +1236,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, unsigned short windowsz = 0; unsigned long ngroups; unsigned long num = *count; + int ret; *errp = -ENOSPC; sb = inode->i_sb; @@ -1247,8 +1248,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, /* * Check quota for allocation of this block. */ - if (vfs_dq_alloc_block(inode, num)) { - *errp = -EDQUOT; + ret = dquot_alloc_block(inode, num); + if (ret) { + *errp = ret; return 0; } @@ -1409,7 +1411,7 @@ allocated: *errp = 0; brelse(bitmap_bh); - vfs_dq_free_block(inode, *count-num); + dquot_free_block(inode, *count-num); *count = num; return ret_block; @@ -1420,7 +1422,7 @@ out: * Undo the block allocation */ if (!performed_allocation) - vfs_dq_free_block(inode, *count); + dquot_free_block(inode, *count); brelse(bitmap_bh); return 0; } diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 586e3589d4c..5d198d0697f 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -20,6 +20,7 @@ #include <linux/time.h> #include <linux/pagemap.h> +#include <linux/quotaops.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -70,7 +71,7 @@ const struct file_operations ext2_file_operations = { .compat_ioctl = ext2_compat_ioctl, #endif .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, .splice_read = generic_file_splice_read, @@ -87,7 +88,7 @@ const struct file_operations ext2_xip_file_operations = { .compat_ioctl = ext2_compat_ioctl, #endif .mmap = xip_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, }; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 15387c9c17d..ad7d572ee8d 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode) if (!is_bad_inode(inode)) { /* Quota is already initialized in iput() */ ext2_xattr_delete_inode(inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); } es = EXT2_SB(sb)->s_es; @@ -586,10 +586,10 @@ got: goto fail_drop; } - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext2_init_acl(inode, dir); if (err) @@ -605,10 +605,10 @@ got: return inode; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 36ae1cac767..fc13cc119aa 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -60,6 +60,8 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode) */ void ext2_delete_inode (struct inode * inode) { + if (!is_bad_inode(inode)) + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -1464,9 +1466,12 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) error = inode_change_ok(inode, iattr); if (error) return error; + + if (iattr->ia_valid & ATTR_SIZE) + dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0; + error = dquot_transfer(inode, iattr); if (error) return error; } diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index dd7175ce560..71efb0e9a3f 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -31,6 +31,7 @@ */ #include <linux/pagemap.h> +#include <linux/quotaops.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -99,24 +100,27 @@ struct dentry *ext2_get_parent(struct dentry *child) */ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) { - struct inode * inode = ext2_new_inode (dir, mode); - int err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; - } else if (test_opt(inode->i_sb, NOBH)) { - inode->i_mapping->a_ops = &ext2_nobh_aops; - inode->i_fop = &ext2_file_operations; - } else { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_file_operations; - } - mark_inode_dirty(inode); - err = ext2_add_nondir(dentry, inode); + struct inode *inode; + + dquot_initialize(dir); + + inode = ext2_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &ext2_file_inode_operations; + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; } - return err; + mark_inode_dirty(inode); + return ext2_add_nondir(dentry, inode); } static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) @@ -127,6 +131,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_ if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + inode = ext2_new_inode (dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -151,6 +157,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out; + dquot_initialize(dir); + inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) @@ -194,6 +202,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, if (inode->i_nlink >= EXT2_LINK_MAX) return -EMLINK; + dquot_initialize(dir); + inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); atomic_inc(&inode->i_count); @@ -216,6 +226,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT2_LINK_MAX) goto out; + dquot_initialize(dir); + inode_inc_link_count(dir); inode = ext2_new_inode (dir, S_IFDIR | mode); @@ -262,6 +274,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) struct page * page; int err = -ENOENT; + dquot_initialize(dir); + de = ext2_find_entry (dir, &dentry->d_name, &page); if (!de) goto out; @@ -304,6 +318,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * old_de; int err = -ENOENT; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f9cb54a585c..42e4a303b67 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -194,6 +194,8 @@ static void destroy_inodecache(void) static void ext2_clear_inode(struct inode *inode) { struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info; + + dquot_drop(inode); ext2_discard_reservation(inode); EXT2_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 904f00642f8..e44dc92609b 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -644,8 +644,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, the inode. */ ea_bdebug(new_bh, "reusing block"); - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) { + error = dquot_alloc_block(inode, 1); + if (error) { unlock_buffer(new_bh); goto cleanup; } @@ -702,7 +702,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * as if nothing happened and cleanup the unused block */ if (error && error != -ENOSPC) { if (new_bh && new_bh != old_bh) - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; } } else @@ -734,7 +734,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, le32_add_cpu(&HDR(old_bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); mark_buffer_dirty(old_bh); ea_bdebug(old_bh, "refcount now=%d", le32_to_cpu(HDR(old_bh)->h_refcount)); @@ -797,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode) mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); } EXT2_I(inode)->i_file_acl = 0; diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 27967f92e82..161da2d3f89 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -676,7 +676,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode, } ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) - vfs_dq_free_block(inode, dquot_freed_blocks); + dquot_free_block(inode, dquot_freed_blocks); return; } @@ -1502,8 +1502,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, /* * Check quota for allocation of this block. */ - if (vfs_dq_alloc_block(inode, num)) { - *errp = -EDQUOT; + err = dquot_alloc_block(inode, num); + if (err) { + *errp = err; return 0; } @@ -1713,7 +1714,7 @@ allocated: *errp = 0; brelse(bitmap_bh); - vfs_dq_free_block(inode, *count-num); + dquot_free_block(inode, *count-num); *count = num; return ret_block; @@ -1728,7 +1729,7 @@ out: * Undo the block allocation */ if (!performed_allocation) - vfs_dq_free_block(inode, *count); + dquot_free_block(inode, *count); brelse(bitmap_bh); return 0; } diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 388bbdfa0b4..f55df0e61cb 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -21,6 +21,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd.h> +#include <linux/quotaops.h> #include <linux/ext3_fs.h> #include <linux/ext3_jbd.h> #include "xattr.h" @@ -33,9 +34,9 @@ */ static int ext3_release_file (struct inode * inode, struct file * filp) { - if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) { + if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) { filemap_flush(inode->i_mapping); - EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE; + ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && @@ -62,7 +63,7 @@ const struct file_operations ext3_file_operations = { .compat_ioctl = ext3_compat_ioctl, #endif .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext3_release_file, .fsync = ext3_sync_file, .splice_read = generic_file_splice_read, diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index b3999128513..ef9008b885b 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_init(inode); + dquot_initialize(inode); ext3_xattr_delete_inode(handle, inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -588,10 +588,10 @@ got: sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; ret = inode; - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext3_init_acl(handle, inode, dir); if (err) @@ -619,10 +619,10 @@ really_out: return ret; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 7aca55fcc97..7f920b7263a 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -196,6 +196,9 @@ void ext3_delete_inode (struct inode * inode) { handle_t *handle; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -1378,7 +1381,7 @@ static int ext3_journalled_write_end(struct file *file, */ if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); if (inode->i_size > EXT3_I(inode)->i_disksize) { EXT3_I(inode)->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); @@ -1417,7 +1420,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; - if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { + if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -1436,7 +1439,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) * everything they get. */ - EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; + ext3_clear_inode_state(inode, EXT3_STATE_JDATA); journal = EXT3_JOURNAL(inode); journal_lock_updates(journal); err = journal_flush(journal); @@ -1528,6 +1531,7 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); /* * We give up here if we're reentered, because it might be for a @@ -1600,6 +1604,9 @@ static int ext3_writeback_writepage(struct page *page, int ret = 0; int err; + J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); + if (ext3_journal_current_handle()) goto out_fail; @@ -1642,6 +1649,9 @@ static int ext3_journalled_writepage(struct page *page, int ret = 0; int err; + J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); + if (ext3_journal_current_handle()) goto no_write; @@ -1670,7 +1680,7 @@ static int ext3_journalled_writepage(struct page *page, PAGE_CACHE_SIZE, NULL, write_end_fn); if (ret == 0) ret = err; - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); unlock_page(page); } else { /* @@ -1785,8 +1795,9 @@ retry: handle = ext3_journal_start(inode, 2); if (IS_ERR(handle)) { /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ + * but cannot extend i_size. Truncate allocated blocks + * and pretend the write failed... */ + ext3_truncate(inode); ret = PTR_ERR(handle); goto out; } @@ -2402,7 +2413,7 @@ void ext3_truncate(struct inode *inode) goto out_notrans; if (inode->i_size == 0 && ext3_should_writeback_data(inode)) - ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; + ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); /* * We have to lock the EOF page here, because lock_page() nests @@ -2721,7 +2732,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) { /* We have all inode data except xattrs in memory here. */ return __ext3_get_inode_loc(inode, iloc, - !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); + !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); } void ext3_set_inode_flags(struct inode *inode) @@ -2893,7 +2904,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) - ei->i_state |= EXT3_STATE_XATTR; + ext3_set_inode_state(inode, EXT3_STATE_XATTR); } } else ei->i_extra_isize = 0; @@ -2955,7 +2966,7 @@ again: /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ - if (ei->i_state & EXT3_STATE_NEW) + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); ext3_get_inode_flags(ei); @@ -3052,7 +3063,7 @@ again: rc = ext3_journal_dirty_metadata(handle, bh); if (!err) err = rc; - ei->i_state &= ~EXT3_STATE_NEW; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); out_brelse: @@ -3140,6 +3151,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if (ia_valid & ATTR_SIZE) + dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; @@ -3152,7 +3165,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext3_journal_stop(handle); return error; @@ -3237,7 +3250,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) ret = 2 * (bpp + indirects) + 2; #ifdef CONFIG_QUOTA - /* We know that structure was already allocated during vfs_dq_init so + /* We know that structure was already allocated during dquot_initialize so * we will be updating only the data blocks + inodes */ ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif @@ -3328,7 +3341,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, vfs_dq_alloc_space() will always dirty the inode when blocks + * Also, dquot_alloc_space() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 7b0e44f7d66..ee184084ca4 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1696,6 +1696,8 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, struct inode * inode; int err, retries = 0; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1730,6 +1732,8 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1766,6 +1770,8 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT3_LINK_MAX) return -EMLINK; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -2060,7 +2066,9 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2119,7 +2127,9 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2174,6 +2184,8 @@ static int ext3_symlink (struct inode * dir, if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + @@ -2228,6 +2240,9 @@ static int ext3_link (struct dentry * old_dentry, if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; + + dquot_initialize(dir); + /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing * otherwise has the potential to corrupt the orphan inode list. @@ -2278,12 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, struct ext3_dir_entry_2 * old_de, * new_de; int retval, flush_file = 0; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new_dentry->d_inode) - vfs_dq_init(new_dentry->d_inode); + dquot_initialize(new_dentry->d_inode); handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index afa2b569da1..e844accbf55 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -181,7 +181,7 @@ static void ext3_handle_error(struct super_block *sb) if (!test_opt (sb, ERRORS_CONT)) { journal_t *journal = EXT3_SB(sb)->s_journal; - EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); if (journal) journal_abort(journal, -EIO); } @@ -296,7 +296,7 @@ void ext3_abort (struct super_block * sb, const char * function, "error: remounting filesystem read-only"); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; sb->s_flags |= MS_RDONLY; - EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); if (EXT3_SB(sb)->s_journal) journal_abort(EXT3_SB(sb)->s_journal, -EIO); } @@ -528,6 +528,8 @@ static void destroy_inodecache(void) static void ext3_clear_inode(struct inode *inode) { struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; + + dquot_drop(inode); ext3_discard_reservation(inode); EXT3_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) @@ -562,10 +564,10 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl if (sbi->s_qf_names[GRPQUOTA]) seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) + if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); - if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) + if (test_opt(sb, GRPQUOTA)) seq_puts(seq, ",grpquota"); #endif } @@ -656,8 +658,7 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & - EXT3_MOUNT_DATA_FLAGS)); + seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -751,13 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext3_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, @@ -896,6 +890,63 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) return sb_block; } +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext3_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + ext3_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext3_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sbi->s_mount_opt, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) { + + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + static int parse_options (char *options, struct super_block *sb, unsigned int *inum, unsigned long *journal_devnum, ext3_fsblk_t *n_blocks_count, int is_remount) @@ -906,8 +957,7 @@ static int parse_options (char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype, qfmt; - char *qname; + int qfmt; #endif if (!options) @@ -1065,20 +1115,19 @@ static int parse_options (char *options, struct super_block *sb, data_opt = EXT3_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) - == data_opt) + if (test_opt(sb, DATA_FLAGS) == data_opt) break; ext3_msg(sb, KERN_ERR, "error: cannot change " "data mode on remount. The filesystem " "is mounted in data=%s mode and you " "try to remount it in data=%s mode.", - data_mode_string(sbi->s_mount_opt & - EXT3_MOUNT_DATA_FLAGS), + data_mode_string(test_opt(sb, + DATA_FLAGS)), data_mode_string(data_opt)); return 0; } else { - sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; + clear_opt(sbi->s_mount_opt, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; @@ -1090,62 +1139,20 @@ static int parse_options (char *options, struct super_block *sb, break; #ifdef CONFIG_QUOTA case Opt_usrjquota: - qtype = USRQUOTA; - goto set_qf_name; - case Opt_grpjquota: - qtype = GRPQUOTA; -set_qf_name: - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, - "error: cannot change journaled " - "quota options when quota turned on."); - return 0; - } - qname = match_strdup(&args[0]); - if (!qname) { - ext3_msg(sb, KERN_ERR, - "error: not enough memory for " - "storing quotafile name."); + if (!set_qf_name(sb, USRQUOTA, &args[0])) return 0; - } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext3_msg(sb, KERN_ERR, - "error: %s quota file already " - "specified.", QTYPE2NAME(qtype)); - kfree(qname); - return 0; - } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { - ext3_msg(sb, KERN_ERR, - "error: quotafile must be on " - "filesystem root."); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) return 0; - } - set_opt(sbi->s_mount_opt, QUOTA); break; case Opt_offusrjquota: - qtype = USRQUOTA; - goto clear_qf_name; + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; case Opt_offgrpjquota: - qtype = GRPQUOTA; -clear_qf_name: - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, "error: cannot change " - "journaled quota options when " - "quota turned on."); + if (!clear_qf_name(sb, GRPQUOTA)) return 0; - } - /* - * The space will be released later when all options - * are confirmed to be correct - */ - sbi->s_qf_names[qtype] = NULL; break; case Opt_jqfmt_vfsold: qfmt = QFMT_VFS_OLD; @@ -1244,18 +1251,12 @@ set_qf_format: } #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) && - sbi->s_qf_names[USRQUOTA]) + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sbi->s_mount_opt, USRQUOTA); - - if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) && - sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) clear_opt(sbi->s_mount_opt, GRPQUOTA); - if ((sbi->s_qf_names[USRQUOTA] && - (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) || - (sbi->s_qf_names[GRPQUOTA] && - (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) { + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { ext3_msg(sb, KERN_ERR, "error: old and new quota " "format mixing."); return 0; @@ -1478,7 +1479,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, } list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - vfs_dq_init(inode); + dquot_initialize(inode); if (inode->i_nlink) { printk(KERN_DEBUG "%s: truncating inode %lu to %Ld bytes\n", @@ -1671,11 +1672,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, POSIX_ACL); #endif if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) - sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; + set_opt(sbi->s_mount_opt, JOURNAL_DATA); else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) - sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA; + set_opt(sbi->s_mount_opt, ORDERED_DATA); else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) - sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA; + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) set_opt(sbi->s_mount_opt, ERRORS_PANIC); @@ -1694,7 +1695,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || @@ -2561,11 +2562,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) goto restore_opts; } - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + if (test_opt(sb, ABORT)) ext3_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); es = sbi->s_es; @@ -2573,7 +2574,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > le32_to_cpu(es->s_blocks_count)) { - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) { + if (test_opt(sb, ABORT)) { err = -EROFS; goto restore_opts; } @@ -2734,7 +2735,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) * Process 1 Process 2 * ext3_create() quota_sync() * journal_start() write_dquot() - * vfs_dq_init() down(dqio_mutex) + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) journal_start() * */ @@ -2942,9 +2943,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int tocopy; int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; - size_t towrite = len; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -2955,53 +2954,54 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } + + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; - bh = ext3_bread(handle, inode, blk, 1, &err); - if (!bh) + bh = ext3_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext3_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); goto out; - if (journal_quota) { - err = ext3_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } - } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - if (journal_quota) - err = ext3_journal_dirty_metadata(handle, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext3_journal_dirty_data(handle, bh); - mark_buffer_dirty(bh); } - brelse(bh); - if (err) - goto out; - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext3_journal_dirty_metadata(handle, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + brelse(bh); out: - if (len == towrite) { + if (err) { mutex_unlock(&inode->i_mutex); return err; } - if (inode->i_size < off+len-towrite) { - i_size_write(inode, off+len-towrite); + if (inode->i_size < off + len) { + i_size_write(inode, off + len); EXT3_I(inode)->i_disksize = inode->i_size; } inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); mutex_unlock(&inode->i_mutex); - return len - towrite; + return len; } #endif diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 66895ccf76c..534a94c3a93 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -274,7 +274,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *end; int error; - if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) return -ENODATA; error = ext3_get_inode_loc(inode, &iloc); if (error) @@ -403,7 +403,7 @@ ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) void *end; int error; - if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) return 0; error = ext3_get_inode_loc(inode, &iloc); if (error) @@ -500,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode, error = ext3_journal_dirty_metadata(handle, bh); if (IS_SYNC(inode)) handle->h_sync = 1; - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); if (ce) @@ -775,8 +775,8 @@ inserted: else { /* The old block is released after updating the inode. */ - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) + error = dquot_alloc_block(inode, 1); + if (error) goto cleanup; error = ext3_journal_get_write_access(handle, new_bh); @@ -850,7 +850,7 @@ cleanup: return error; cleanup_dquot: - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; bad_block: @@ -882,7 +882,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i, is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; - if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) { + if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) { error = ext3_xattr_check_names(IFIRST(header), is->s.end); if (error) return error; @@ -914,10 +914,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, header = IHDR(inode, ext3_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); - EXT3_I(inode)->i_state |= EXT3_STATE_XATTR; + ext3_set_inode_state(inode, EXT3_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); - EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR; + ext3_clear_inode_state(inode, EXT3_STATE_XATTR); } return 0; } @@ -967,10 +967,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; - if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) { struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); - EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); } error = ext3_xattr_ibody_find(inode, &i, &is); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 503a4892740..d0776e410f3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,6 +23,7 @@ #include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/quotaops.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -125,7 +126,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) sb->s_dirt = 1; } } - return generic_file_open(inode, filp); + return dquot_file_open(inode, filp); } const struct file_operations ext4_file_operations = { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 004c9da9e5c..361c0b9962a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -214,10 +214,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_init(inode); + dquot_initialize(inode); ext4_xattr_delete_inode(handle, inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -1029,10 +1029,10 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ret = inode; - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext4_init_acl(handle, inode, dir); if (err) @@ -1069,10 +1069,10 @@ really_out: return ret; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f977aade0d1..986120f3006 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -171,6 +171,9 @@ void ext4_delete_inode(struct inode *inode) handle_t *handle; int err; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); @@ -1108,9 +1111,9 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update quota subsystem */ if (quota_claim) { - vfs_dq_claim_block(inode, used); + dquot_claim_block(inode, used); if (mdb_free) - vfs_dq_release_reservation_block(inode, mdb_free); + dquot_release_reservation_block(inode, mdb_free); } else { /* * We did fallocate with an offset that is already delayed @@ -1121,8 +1124,8 @@ void ext4_da_update_reserve_space(struct inode *inode, * that */ if (allocated_meta_blocks) - vfs_dq_claim_block(inode, allocated_meta_blocks); - vfs_dq_release_reservation_block(inode, mdb_free + used); + dquot_claim_block(inode, allocated_meta_blocks); + dquot_release_reservation_block(inode, mdb_free + used); } /* @@ -1857,6 +1860,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned long md_needed, md_reserved; + int ret; /* * recalculate the amount of metadata blocks to reserve @@ -1875,11 +1879,12 @@ repeat: * later. Real quota accounting is done at pages writeout * time. */ - if (vfs_dq_reserve_block(inode, md_needed + 1)) - return -EDQUOT; + ret = dquot_reserve_block(inode, md_needed + 1); + if (ret) + return ret; if (ext4_claim_free_blocks(sbi, md_needed + 1)) { - vfs_dq_release_reservation_block(inode, md_needed + 1); + dquot_release_reservation_block(inode, md_needed + 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; @@ -1936,7 +1941,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, to_free); + dquot_release_reservation_block(inode, to_free); } static void ext4_da_page_release_reservation(struct page *page, @@ -5418,6 +5423,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if (ia_valid & ATTR_SIZE) + dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; @@ -5430,7 +5437,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext4_journal_stop(handle); return error; @@ -5816,7 +5823,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, vfs_dq_alloc_block() will always dirty the inode when blocks + * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index abb11e328b6..506713a2ebd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4240,7 +4240,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, return 0; } reserv_blks = ar->len; - while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) { + while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } @@ -4317,7 +4317,7 @@ out2: kmem_cache_free(ext4_ac_cachep, ac); out1: if (inquota && ar->len < inquota) - vfs_dq_free_block(ar->inode, inquota - ar->len); + dquot_free_block(ar->inode, inquota - ar->len); out3: if (!ar->len) { if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) @@ -4631,7 +4631,7 @@ do_more: sb->s_dirt = 1; error_return: if (freed) - vfs_dq_free_block(inode, freed); + dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); if (ac) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 608d21f873e..0c070fabd10 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1759,6 +1759,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, struct inode *inode; int err, retries = 0; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1793,6 +1795,8 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1830,6 +1834,8 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -2137,7 +2143,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2196,7 +2204,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2251,6 +2261,8 @@ static int ext4_symlink(struct inode *dir, if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + @@ -2309,6 +2321,8 @@ static int ext4_link(struct dentry *old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; + dquot_initialize(dir); + /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing * otherwise has the potential to corrupt the orphan inode list. @@ -2359,12 +2373,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_dir_entry_2 *old_de, *new_de; int retval, force_da_alloc = 0; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new_dentry->d_inode) - vfs_dq_init(new_dentry->d_inode); + dquot_initialize(new_dentry->d_inode); handle = ext4_journal_start(old_dir, 2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ad1ee5f21ba..2b83b96cb2e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -798,6 +798,7 @@ static void destroy_inodecache(void) static void ext4_clear_inode(struct inode *inode) { + dquot_drop(inode); ext4_discard_preallocations(inode); if (EXT4_JOURNAL(inode)) jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, @@ -1052,19 +1053,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext4_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .reserve_space = dquot_reserve_space, - .claim_space = dquot_claim_space, - .release_rsv = dquot_release_reserved_space, #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, @@ -2014,7 +2005,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - vfs_dq_init(inode); + dquot_initialize(inode); if (inode->i_nlink) { ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", @@ -3801,7 +3792,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) * Process 1 Process 2 * ext4_create() quota_sync() * jbd2_journal_start() write_dquot() - * vfs_dq_init() down(dqio_mutex) + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) jbd2_journal_start() * */ diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index efc16a4b7ce..b4c5aa8489d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -495,7 +495,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); if (ce) @@ -787,8 +787,8 @@ inserted: else { /* The old block is released after updating the inode. */ - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) + error = dquot_alloc_block(inode, 1); + if (error) goto cleanup; error = ext4_journal_get_write_access(handle, new_bh); @@ -876,7 +876,7 @@ cleanup: return error; cleanup_dquot: - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; bad_block: diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e3bf6eab875..6dbcbad6ab1 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1083,7 +1083,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, } } -int gfs2_quota_sync(struct super_block *sb, int type) +int gfs2_quota_sync(struct super_block *sb, int type, int wait) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; @@ -1127,6 +1127,11 @@ int gfs2_quota_sync(struct super_block *sb, int type) return error; } +static int gfs2_quota_sync_timeo(struct super_block *sb, int type) +{ + return gfs2_quota_sync(sb, type, 0); +} + int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) { struct gfs2_quota_data *qd; @@ -1382,7 +1387,7 @@ int gfs2_quotad(void *data) &tune->gt_statfs_quantum); /* Update quota file */ - quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, + quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t, "ad_timeo, &tune->gt_quota_quantum); /* Check for & recover partially truncated inodes */ diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index e271fa07ad0..195f60c8bd1 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -25,7 +25,7 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 uid, u32 gid); -extern int gfs2_quota_sync(struct super_block *sb, int type); +extern int gfs2_quota_sync(struct super_block *sb, int type, int wait); extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); extern int gfs2_quota_init(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ca87598ead7..50aac606b99 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -764,7 +764,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) int error; flush_workqueue(gfs2_delete_workqueue); - gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_quota_sync(sdp->sd_vfs, 0, 1); gfs2_statfs_sync(sdp->sd_vfs, 0); error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index a0db1c94317..b5f1a46133c 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -167,7 +167,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_quota_sync(sdp->sd_vfs, 0, 1); return len; } diff --git a/fs/inode.c b/fs/inode.c index 03dfeb2e392..407bf392e20 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/dcache.h> #include <linux/init.h> -#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/writeback.h> #include <linux/module.h> @@ -314,7 +313,6 @@ void clear_inode(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); inode_sync_wait(inode); - vfs_dq_drop(inode); if (inode->i_sb->s_op->clear_inode) inode->i_sb->s_op->clear_inode(inode); if (S_ISBLK(inode->i_mode) && inode->i_bdev) @@ -1211,8 +1209,6 @@ void generic_delete_inode(struct inode *inode) if (op->delete_inode) { void (*delete)(struct inode *) = op->delete_inode; - if (!is_bad_inode(inode)) - vfs_dq_init(inode); /* Filesystems implementing their own * s_op->delete_inode are required to call * truncate_inode_pages and clear_inode() diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 4bd882548c4..2c90e3ef625 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -862,12 +862,12 @@ restart_loop: /* A buffer which has been freed while still being * journaled by a previous transaction may end up still * being dirty here, but we want to avoid writing back - * that buffer in the future now that the last use has - * been committed. That's not only a performance gain, - * it also stops aliasing problems if the buffer is left - * behind for writeback and gets reallocated for another + * that buffer in the future after the "add to orphan" + * operation been committed, That's not only a performance + * gain, it also stops aliasing problems if the buffer is + * left behind for writeback and gets reallocated for another * use in a different page. */ - if (buffer_freed(bh)) { + if (buffer_freed(bh) && !jh->b_next_transaction) { clear_buffer_freed(bh); clear_buffer_jbddirty(bh); } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 006f9ad838a..99e9fea1107 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1864,6 +1864,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!jh) goto zap_buffer_no_jh; + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * buffer can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it @@ -1929,16 +1944,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer; } /* - * If it is committing, we simply cannot touch it. We - * can remove it's next_transaction pointer from the - * running transaction if that is set, but nothing - * else. */ + * The buffer is committing, we simply cannot touch + * it. So we just set j_next_transaction to the + * running transaction (if there is one) and mark + * buffer as freed so that commit code knows it should + * clear dirty bits when it is done with the buffer. + */ set_buffer_freed(bh); - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == - journal->j_running_transaction); - jh->b_next_transaction = NULL; - } + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -2120,7 +2134,7 @@ void journal_file_buffer(struct journal_head *jh, */ void __journal_refile_buffer(struct journal_head *jh) { - int was_dirty; + int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); @@ -2142,8 +2156,13 @@ void __journal_refile_buffer(struct journal_head *jh) __journal_temp_unlink_buffer(jh); jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; - __journal_file_buffer(jh, jh->b_transaction, - jh->b_modified ? BJ_Metadata : BJ_Reserved); + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index d66477c3430..213169780b6 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -20,7 +20,6 @@ #include <linux/sched.h> #include <linux/fs.h> -#include <linux/quotaops.h> #include <linux/posix_acl_xattr.h> #include "jfs_incore.h" #include "jfs_txnmgr.h" @@ -174,7 +173,7 @@ cleanup: return rc; } -static int jfs_acl_chmod(struct inode *inode) +int jfs_acl_chmod(struct inode *inode) { struct posix_acl *acl, *clone; int rc; @@ -205,26 +204,3 @@ static int jfs_acl_chmod(struct inode *inode) posix_acl_release(clone); return rc; } - -int jfs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - int rc; - - rc = inode_change_ok(inode, iattr); - if (rc) - return rc; - - if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || - (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - if (vfs_dq_transfer(inode, iattr)) - return -EDQUOT; - } - - rc = inode_setattr(inode, iattr); - - if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = jfs_acl_chmod(inode); - - return rc; -} diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 2b70fa78e4a..14ba982b3f2 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -18,6 +18,7 @@ */ #include <linux/fs.h> +#include <linux/quotaops.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_dmap.h" @@ -47,7 +48,7 @@ static int jfs_open(struct inode *inode, struct file *file) { int rc; - if ((rc = generic_file_open(inode, file))) + if ((rc = dquot_file_open(inode, file))) return rc; /* @@ -88,14 +89,40 @@ static int jfs_release(struct inode *inode, struct file *file) return 0; } +int jfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int rc; + + rc = inode_change_ok(inode, iattr); + if (rc) + return rc; + + if (iattr->ia_valid & ATTR_SIZE) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + rc = dquot_transfer(inode, iattr); + if (rc) + return rc; + } + + rc = inode_setattr(inode, iattr); + + if (!rc && (iattr->ia_valid & ATTR_MODE)) + rc = jfs_acl_chmod(inode); + + return rc; +} + const struct inode_operations jfs_file_inode_operations = { .truncate = jfs_truncate, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, .listxattr = jfs_listxattr, .removexattr = jfs_removexattr, -#ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 182b78cc3e6..9dd126276c9 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -149,6 +149,9 @@ void jfs_delete_inode(struct inode *inode) { jfs_info("In jfs_delete_inode, inode = 0x%p", inode); + if (!is_bad_inode(inode)) + dquot_initialize(inode); + if (!is_bad_inode(inode) && (JFS_IP(inode)->fileset == FILESYSTEM_I)) { truncate_inode_pages(&inode->i_data, 0); @@ -161,9 +164,9 @@ void jfs_delete_inode(struct inode *inode) /* * Free the inode from the quota allocation. */ - vfs_dq_init(inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_initialize(inode); + dquot_free_inode(inode); + dquot_drop(inode); } clear_inode(inode); diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index b07bd417ef8..54e07559878 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -22,7 +22,7 @@ int jfs_check_acl(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); -int jfs_setattr(struct dentry *, struct iattr *); +int jfs_acl_chmod(struct inode *inode); #else @@ -32,5 +32,10 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode, return 0; } +static inline int jfs_acl_chmod(struct inode *inode) +{ + return 0; +} + #endif #endif /* _H_JFS_ACL */ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 925871e9887..0e4623be70c 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -381,10 +381,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) * It's time to move the inline table to an external * page and begin to build the xtree */ - if (vfs_dq_alloc_block(ip, sbi->nbperpage)) + if (dquot_alloc_block(ip, sbi->nbperpage)) goto clean_up; if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { - vfs_dq_free_block(ip, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); goto clean_up; } @@ -408,7 +408,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); dbFree(ip, xaddr, sbi->nbperpage); - vfs_dq_free_block(ip, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); goto clean_up; } ip->i_size = PSIZE; @@ -1027,10 +1027,9 @@ static int dtSplitUp(tid_t tid, n = xlen; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, n)) { - rc = -EDQUOT; + rc = dquot_alloc_block(ip, n); + if (rc) goto extendOut; - } quota_allocation += n; if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, @@ -1308,7 +1307,7 @@ static int dtSplitUp(tid_t tid, /* Rollback quota allocation */ if (rc && quota_allocation) - vfs_dq_free_block(ip, quota_allocation); + dquot_free_block(ip, quota_allocation); dtSplitUp_Exit: @@ -1369,9 +1368,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, return -EIO; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); @@ -1892,6 +1892,7 @@ static int dtSplitRoot(tid_t tid, struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; + int rc; /* get split root page */ smp = split->mp; @@ -1916,9 +1917,10 @@ static int dtSplitRoot(tid_t tid, rp = rmp->data; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } BT_MARK_DIRTY(rmp, ip); @@ -2287,7 +2289,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, xlen = lengthPXD(&fp->header.self); /* Free quota allocation. */ - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(fmp); @@ -2363,7 +2365,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, xlen = lengthPXD(&p->header.self); /* Free quota allocation */ - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(mp); diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 41d6045dbeb..5d3bbd10f8d 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) } /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, nxlen)) { + rc = dquot_alloc_block(ip, nxlen); + if (rc) { dbFree(ip, nxaddr, (s64) nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); - return -EDQUOT; + return rc; } /* determine the value of the extent flag */ @@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) */ if (rc) { dbFree(ip, nxaddr, nxlen); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); return (rc); } @@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) goto exit; /* Allocat blocks to quota. */ - if (vfs_dq_alloc_block(ip, nxlen)) { + rc = dquot_alloc_block(ip, nxlen); + if (rc) { dbFree(ip, nxaddr, (s64) nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); - return -EDQUOT; + return rc; } delta = nxlen - xlen; @@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) /* extend the extent */ if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { dbFree(ip, xaddr + xlen, delta); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); goto exit; } } else { @@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) */ if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { dbFree(ip, nxaddr, nxlen); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); goto exit; } } diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index dc0e02159ac..829921b6776 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -116,10 +116,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode) /* * Allocate inode to quota. */ - if (vfs_dq_alloc_inode(inode)) { - rc = -EDQUOT; + dquot_initialize(inode); + rc = dquot_alloc_inode(inode); + if (rc) goto fail_drop; - } inode->i_mode = mode; /* inherit flags from parent */ @@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) return inode; fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; fail_unlock: inode->i_nlink = 0; diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 15902b03c2a..79e2c79661d 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -40,6 +40,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern void jfs_set_inode_flags(struct inode *); extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int jfs_setattr(struct dentry *, struct iattr *); extern const struct address_space_operations jfs_aops; extern const struct inode_operations jfs_dir_inode_operations; diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index d654a645864..6c50871e622 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -585,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */ hint = addressXAD(xad) + lengthXAD(xad) - 1; } else hint = 0; - if ((rc = vfs_dq_alloc_block(ip, xlen))) + if ((rc = dquot_alloc_block(ip, xlen))) goto out; if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); goto out; } } @@ -617,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */ /* undo data extent allocation */ if (*xaddrp == 0) { dbFree(ip, xaddr, (s64) xlen); - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); } return rc; } @@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip, rbn = addressPXD(pxd); /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { - rc = -EDQUOT; + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) goto clean_up; - } quota_allocation += lengthPXD(pxd); @@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip, /* Rollback quota allocation. */ if (quota_allocation) - vfs_dq_free_block(ip, quota_allocation); + dquot_free_block(ip, quota_allocation); return (rc); } @@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid, struct pxdlist *pxdlist; struct tlock *tlck; struct xtlock *xtlck; + int rc; sp = &JFS_IP(ip)->i_xtroot; @@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid, return -EIO; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); @@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) ip->i_size = newsize; /* update quota allocation to reflect freed blocks */ - vfs_dq_free_block(ip, nfreed); + dquot_free_block(ip, nfreed); /* * free tlock of invalidated pages diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index c79a4270f08..4a3e9f39c21 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -85,6 +85,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); + dquot_initialize(dip); + /* * search parent directory for entry/freespace * (dtSearch() returns parent directory page pinned) @@ -215,6 +217,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); + dquot_initialize(dip); + /* link count overflow on parent directory ? */ if (dip->i_nlink == JFS_LINK_MAX) { rc = -EMLINK; @@ -356,7 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ - vfs_dq_init(ip); + dquot_initialize(dip); + dquot_initialize(ip); /* directory must be empty to be removed */ if (!dtEmpty(ip)) { @@ -483,7 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ - vfs_dq_init(ip); + dquot_initialize(dip); + dquot_initialize(ip); if ((rc = get_UCSname(&dname, dentry))) goto out; @@ -805,6 +811,8 @@ static int jfs_link(struct dentry *old_dentry, if (ip->i_nlink == 0) return -ENOENT; + dquot_initialize(dir); + tid = txBegin(ip->i_sb, 0); mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); @@ -896,6 +904,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); + dquot_initialize(dip); + ssize = strlen(name) + 1; /* @@ -1087,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, new_dentry->d_name.name); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_ip = old_dentry->d_inode; new_ip = new_dentry->d_inode; @@ -1136,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else if (new_ip) { IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); /* Init inode for quota operations. */ - vfs_dq_init(new_ip); + dquot_initialize(new_ip); } /* @@ -1360,6 +1373,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, jfs_info("jfs_mknod: %s", dentry->d_name.name); + dquot_initialize(dir); + if ((rc = get_UCSname(&dname, dentry))) goto out; @@ -1541,8 +1556,8 @@ const struct inode_operations jfs_dir_inode_operations = { .getxattr = jfs_getxattr, .listxattr = jfs_listxattr, .removexattr = jfs_removexattr, -#ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index d929a822a74..266699deb1c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -131,6 +131,11 @@ static void jfs_destroy_inode(struct inode *inode) kmem_cache_free(jfs_inode_cachep, ji); } +static void jfs_clear_inode(struct inode *inode) +{ + dquot_drop(inode); +} + static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); @@ -745,6 +750,7 @@ static const struct super_operations jfs_super_operations = { .dirty_inode = jfs_dirty_inode, .write_inode = jfs_write_inode, .delete_inode = jfs_delete_inode, + .clear_inode = jfs_clear_inode, .put_super = jfs_put_super, .sync_fs = jfs_sync_fs, .freeze_fs = jfs_freeze, diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index fad364548bc..1f594ab2189 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -260,14 +260,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; /* Allocate new blocks to quota. */ - if (vfs_dq_alloc_block(ip, nblocks)) { - return -EDQUOT; - } + rc = dquot_alloc_block(ip, nblocks); + if (rc) + return rc; rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); if (rc) { /*Rollback quota allocation. */ - vfs_dq_free_block(ip, nblocks); + dquot_free_block(ip, nblocks); return rc; } @@ -332,7 +332,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, failed: /* Rollback quota allocation. */ - vfs_dq_free_block(ip, nblocks); + dquot_free_block(ip, nblocks); dbFree(ip, blkno, nblocks); return rc; @@ -538,7 +538,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) if (blocks_needed > current_blocks) { /* Allocate new blocks to quota. */ - if (vfs_dq_alloc_block(inode, blocks_needed)) + rc = dquot_alloc_block(inode, blocks_needed); + if (rc) return -EDQUOT; quota_allocation = blocks_needed; @@ -602,7 +603,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) clean_up: /* Rollback quota allocation */ if (quota_allocation) - vfs_dq_free_block(inode, quota_allocation); + dquot_free_block(inode, quota_allocation); return (rc); } @@ -677,7 +678,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, /* If old blocks exist, they must be removed from quota allocation. */ if (old_blocks) - vfs_dq_free_block(inode, old_blocks); + dquot_free_block(inode, old_blocks); inode->i_ctime = CURRENT_TIME; diff --git a/fs/namei.c b/fs/namei.c index 9a6456099f1..3d9d2f965f8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -19,7 +19,6 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/namei.h> -#include <linux/quotaops.h> #include <linux/pagemap.h> #include <linux/fsnotify.h> #include <linux/personality.h> @@ -1416,7 +1415,6 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, error = security_inode_create(dir, dentry, mode); if (error) return error; - vfs_dq_init(dir); error = dir->i_op->create(dir, dentry, mode, nd); if (!error) fsnotify_create(dir, dentry); @@ -1586,9 +1584,6 @@ static struct file *finish_open(struct nameidata *nd, } } if (!IS_ERR(filp)) { - if (acc_mode & MAY_WRITE) - vfs_dq_init(nd->path.dentry->d_inode); - if (will_truncate) { error = handle_truncate(&nd->path); if (error) { @@ -1986,7 +1981,6 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->mknod(dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); @@ -2085,7 +2079,6 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->mkdir(dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); @@ -2171,8 +2164,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (!dir->i_op->rmdir) return -EPERM; - vfs_dq_init(dir); - mutex_lock(&dentry->d_inode->i_mutex); dentry_unhash(dentry); if (d_mountpoint(dentry)) @@ -2258,8 +2249,6 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!dir->i_op->unlink) return -EPERM; - vfs_dq_init(dir); - mutex_lock(&dentry->d_inode->i_mutex); if (d_mountpoint(dentry)) error = -EBUSY; @@ -2372,7 +2361,6 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->symlink(dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); @@ -2456,7 +2444,6 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de return error; mutex_lock(&inode->i_mutex); - vfs_dq_init(dir); error = dir->i_op->link(old_dentry, dir, new_dentry); mutex_unlock(&inode->i_mutex); if (!error) @@ -2657,9 +2644,6 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old_dir->i_op->rename) return -EPERM; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); - old_name = fsnotify_oldname_init(old_dentry->d_name.name); if (is_dir) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 15dc2deaac5..8eca17df4f6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -20,7 +20,6 @@ #include <linux/fcntl.h> #include <linux/namei.h> #include <linux/delay.h> -#include <linux/quotaops.h> #include <linux/fsnotify.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> @@ -377,7 +376,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, put_write_access(inode); goto out_nfserr; } - vfs_dq_init(inode); } /* sanitize the mode change */ @@ -745,8 +743,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, flags = O_RDWR|O_LARGEFILE; else flags = O_WRONLY|O_LARGEFILE; - - vfs_dq_init(inode); } *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), flags, current_cred()); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 2bbe1ecc08c..9f8bd913c51 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5713,7 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto out; } - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(inode->i_sb, len)); ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); @@ -6936,7 +6936,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, goto bail; } - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - @@ -7301,11 +7301,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, unsigned int page_end; u64 phys; - if (vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, @@ -7381,7 +7380,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4c2a6d282c4..21441ddb550 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1764,10 +1764,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, wc->w_handle = handle; - if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { - ret = -EDQUOT; - goto out_commit; + if (clusters_to_alloc) { + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); + if (ret) + goto out_commit; } /* * We don't want this to fail in ocfs2_write_end(), so do it @@ -1810,7 +1811,7 @@ success: return 0; out_quota: if (clusters_to_alloc) - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); out_commit: ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 765d66c7098..efd77d071c8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, - alloc + dx_alloc))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc)); + if (ret) goto out_commit; - } did_quota = 1; if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { @@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, bytes_allocated); + dquot_free_space_nodirty(dir, bytes_allocated); ocfs2_commit_trans(osb, handle); @@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb, if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(sb, 1))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(sb, 1)); + if (status) goto bail; - } did_quota = 1; status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, @@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb, status = 0; bail: if (did_quota && status < 0) - vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); mlog_exit(status); return status; } @@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(dir->i_sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, @@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); ocfs2_commit_trans(osb, handle); @@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; /* @@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5b52547d629..17947dc8341 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); + if (file->f_mode & FMODE_WRITE) + dquot_initialize(inode); + spin_lock(&oi->ip_lock); /* Check that the inode hasn't been wiped from disk by another @@ -629,11 +632,10 @@ restart_all: } restarted_transaction: - if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, - clusters_to_add))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + if (status) goto leave; - } did_quota = 1; /* reserve a write to the file entry early on - that we if we @@ -674,7 +676,7 @@ restarted_transaction: clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); spin_unlock(&OCFS2_I(inode)->ip_lock); /* Release unused quota reservation */ - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); did_quota = 0; @@ -710,7 +712,7 @@ restarted_transaction: leave: if (status < 0 && did_quota) - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); if (handle) { ocfs2_commit_trans(osb, handle); @@ -978,6 +980,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { + dquot_initialize(inode); + status = ocfs2_rw_lock(inode, 1); if (status < 0) { mlog_errno(status); @@ -1020,7 +1024,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) /* * Gather pointers to quota structures so that allocation / * freeing of quota structures happens here and not inside - * vfs_dq_transfer() where we have problems with lock ordering + * dquot_transfer() where we have problems with lock ordering */ if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid && OCFS2_HAS_RO_COMPAT_FEATURE(sb, @@ -1053,7 +1057,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_errno(status); goto bail_unlock; } - status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + status = dquot_transfer(inode, attr); if (status < 0) goto bail_commit; } else { diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 88459bdd1ff..278a223aae1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -665,7 +665,7 @@ static int ocfs2_remove_inode(struct inode *inode, } ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); status = ocfs2_free_dinode(handle, inode_alloc_inode, inode_alloc_bh, di); @@ -971,6 +971,8 @@ void ocfs2_delete_inode(struct inode *inode) goto bail; } + dquot_initialize(inode); + if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages * here but we do it for safety anyway (it will most @@ -1087,6 +1089,8 @@ void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); + dquot_drop(inode); + /* To preven remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. */ ocfs2_open_unlock(inode); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 50fb26a6a5f..d9cd4e373a5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) } else inode->i_gid = current_fsgid(); inode->i_mode = mode; - vfs_dq_init(inode); + dquot_initialize(inode); return inode; } @@ -244,6 +244,8 @@ static int ocfs2_mknod(struct inode *dir, (unsigned long)dev, dentry->d_name.len, dentry->d_name.name); + dquot_initialize(dir); + /* get our super block */ osb = OCFS2_SB(dir->i_sb); @@ -348,13 +350,9 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto leave; - } did_quota_inode = 1; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, @@ -431,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir, status = 0; leave: if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -636,6 +634,8 @@ static int ocfs2_link(struct dentry *old_dentry, if (S_ISDIR(inode->i_mode)) return -EPERM; + dquot_initialize(dir); + err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); if (err < 0) { if (err != -ENOENT) @@ -791,6 +791,8 @@ static int ocfs2_unlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, dentry->d_name.len, dentry->d_name.name); + dquot_initialize(dir); + BUG_ON(dentry->d_parent->d_inode != dir); mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1051,6 +1053,9 @@ static int ocfs2_rename(struct inode *old_dir, old_dentry->d_name.len, old_dentry->d_name.name, new_dentry->d_name.len, new_dentry->d_name.name); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + osb = OCFS2_SB(old_dir->i_sb); if (new_inode) { @@ -1599,6 +1604,8 @@ static int ocfs2_symlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); + dquot_initialize(dir); + sb = dir->i_sb; osb = OCFS2_SB(sb); @@ -1688,13 +1695,9 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto bail; - } did_quota_inode = 1; mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, @@ -1716,11 +1719,10 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; - if (vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status) goto bail; - } did_quota = 1; status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, new_fe_bh, @@ -1788,10 +1790,10 @@ static int ocfs2_symlink(struct inode *dir, d_instantiate(dentry, inode); bail: if (status < 0 && did_quota) - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -2099,13 +2101,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto leave; - } did_quota_inode = 1; inode->i_nlink = 0; @@ -2140,7 +2138,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, insert_inode_hash(inode); leave: if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index b437dc0c4ca..355f41d1d52 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -851,13 +851,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) } const struct dquot_operations ocfs2_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = ocfs2_write_dquot, .acquire_dquot = ocfs2_acquire_dquot, .release_dquot = ocfs2_release_dquot, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index fb6aa7acf54..9e96921dffd 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4390,7 +4390,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, } mutex_lock(&inode->i_mutex); - vfs_dq_init(dir); + dquot_initialize(dir); error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); mutex_unlock(&inode->i_mutex); if (!error) diff --git a/fs/open.c b/fs/open.c index e0b2d88b038..e17f54454b5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> -#include <linux/quotaops.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/slab.h> @@ -278,10 +277,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) error = security_path_truncate(&path, length, 0); - if (!error) { - vfs_dq_init(inode); + if (!error) error = do_truncate(path.dentry, length, 0, NULL); - } put_write_and_out: put_write_access(inode); diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index efc02ebb8c7..dad7fb247dd 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -59,3 +59,8 @@ config QUOTACTL bool depends on XFS_QUOTA || QUOTA default y + +config QUOTACTL_COMPAT + bool + depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT + default y diff --git a/fs/quota/Makefile b/fs/quota/Makefile index 68d4f6dc057..5f9e9e276af 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTA_TREE) += quota_tree.o obj-$(CONFIG_QUOTACTL) += quota.o +obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o +obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/fs/quota/compat.c b/fs/quota/compat.c new file mode 100644 index 00000000000..fb1892fe3e5 --- /dev/null +++ b/fs/quota/compat.c @@ -0,0 +1,118 @@ + +#include <linux/syscalls.h> +#include <linux/compat.h> +#include <linux/quotaops.h> + +/* + * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) + * and is necessary due to alignment problems. + */ +struct compat_if_dqblk { + compat_u64 dqb_bhardlimit; + compat_u64 dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_u64 dqb_ihardlimit; + compat_u64 dqb_isoftlimit; + compat_u64 dqb_curinodes; + compat_u64 dqb_btime; + compat_u64 dqb_itime; + compat_uint_t dqb_valid; +}; + +/* XFS structures */ +struct compat_fs_qfilestat { + compat_u64 dqb_bhardlimit; + compat_u64 qfs_nblks; + compat_uint_t qfs_nextents; +}; + +struct compat_fs_quota_stat { + __s8 qs_version; + __u16 qs_flags; + __s8 qs_pad; + struct compat_fs_qfilestat qs_uquota; + struct compat_fs_qfilestat qs_gquota; + compat_uint_t qs_incoredqs; + compat_int_t qs_btimelimit; + compat_int_t qs_itimelimit; + compat_int_t qs_rtbtimelimit; + __u16 qs_bwarnlimit; + __u16 qs_iwarnlimit; +}; + +asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr) +{ + unsigned int cmds; + struct if_dqblk __user *dqblk; + struct compat_if_dqblk __user *compat_dqblk; + struct fs_quota_stat __user *fsqstat; + struct compat_fs_quota_stat __user *compat_fsqstat; + compat_uint_t data; + u16 xdata; + long ret; + + cmds = cmd >> SUBCMDSHIFT; + + switch (cmds) { + case Q_GETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = sys_quotactl(cmd, special, id, dqblk); + if (ret) + break; + if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || + get_user(data, &dqblk->dqb_valid) || + put_user(data, &compat_dqblk->dqb_valid)) + ret = -EFAULT; + break; + case Q_SETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = -EFAULT; + if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || + get_user(data, &compat_dqblk->dqb_valid) || + put_user(data, &dqblk->dqb_valid)) + break; + ret = sys_quotactl(cmd, special, id, dqblk); + break; + case Q_XGETQSTAT: + fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); + compat_fsqstat = addr; + ret = sys_quotactl(cmd, special, id, fsqstat); + if (ret) + break; + ret = -EFAULT; + /* Copying qs_version, qs_flags, qs_pad */ + if (copy_in_user(compat_fsqstat, fsqstat, + offsetof(struct compat_fs_quota_stat, qs_uquota))) + break; + /* Copying qs_uquota */ + if (copy_in_user(&compat_fsqstat->qs_uquota, + &fsqstat->qs_uquota, + sizeof(compat_fsqstat->qs_uquota)) || + get_user(data, &fsqstat->qs_uquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) + break; + /* Copying qs_gquota */ + if (copy_in_user(&compat_fsqstat->qs_gquota, + &fsqstat->qs_gquota, + sizeof(compat_fsqstat->qs_gquota)) || + get_user(data, &fsqstat->qs_gquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) + break; + /* Copying the rest */ + if (copy_in_user(&compat_fsqstat->qs_incoredqs, + &fsqstat->qs_incoredqs, + sizeof(struct compat_fs_quota_stat) - + offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || + get_user(xdata, &fsqstat->qs_iwarnlimit) || + put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) + break; + ret = 0; + break; + default: + ret = sys_quotactl(cmd, special, id, addr); + } + return ret; +} diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 3fc62b097be..e0b870f4749 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -100,9 +100,13 @@ * * Any operation working on dquots via inode pointers must hold dqptr_sem. If * operation is just reading pointers from inode (or not using them at all) the - * read lock is enough. If pointers are altered function must hold write lock - * (these locking rules also apply for S_NOQUOTA flag in the inode - note that - * for altering the flag i_mutex is also needed). + * read lock is enough. If pointers are altered function must hold write lock. + * Special care needs to be taken about S_NOQUOTA inode flag (marking that + * inode is a quota file). Functions adding pointers from inode to dquots have + * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they + * have to do all pointer modifications before dropping dqptr_sem. This makes + * sure they cannot race with quotaon which first sets S_NOQUOTA flag and + * then drops all pointers to dquots from an inode. * * Each dquot has its dq_lock mutex. Locked dquots might not be referenced * from inodes (dquot_alloc_space() and such don't check the dq_lock). @@ -225,6 +229,9 @@ static struct hlist_head *dquot_hash; struct dqstats dqstats; EXPORT_SYMBOL(dqstats); +static qsize_t inode_get_rsv_space(struct inode *inode); +static void __dquot_initialize(struct inode *inode, int type); + static inline unsigned int hashfn(const struct super_block *sb, unsigned int id, int type) { @@ -564,7 +571,7 @@ out: } EXPORT_SYMBOL(dquot_scan_active); -int vfs_quota_sync(struct super_block *sb, int type) +int vfs_quota_sync(struct super_block *sb, int type, int wait) { struct list_head *dirty; struct dquot *dquot; @@ -609,6 +616,33 @@ int vfs_quota_sync(struct super_block *sb, int type) spin_unlock(&dq_list_lock); mutex_unlock(&dqopt->dqonoff_mutex); + if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) + return 0; + + /* This is not very clever (and fast) but currently I don't know about + * any other simple way of getting quota data to disk and we must get + * them there for userspace to be visible... */ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, + I_MUTEX_QUOTA); + truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); + mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); + } + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return 0; } EXPORT_SYMBOL(vfs_quota_sync); @@ -840,11 +874,14 @@ static int dqinit_needed(struct inode *inode, int type) static void add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; + int reserved = 0; spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; if (!atomic_read(&inode->i_writecount)) continue; if (!dqinit_needed(inode, type)) @@ -854,7 +891,7 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_unlock(&inode_lock); iput(old_inode); - sb->dq_op->initialize(inode, type); + __dquot_initialize(inode, type); /* We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the inode_lock. * We cannot iput the inode now as we can be holding the last @@ -865,6 +902,12 @@ static void add_dquot_ref(struct super_block *sb, int type) } spin_unlock(&inode_lock); iput(old_inode); + + if (reserved) { + printk(KERN_WARNING "VFS (%s): Writes happened before quota" + " was turned on thus quota information is probably " + "inconsistent. Please run quotacheck(8).\n", sb->s_id); + } } /* @@ -978,10 +1021,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) /* * Claim reserved quota space */ -static void dquot_claim_reserved_space(struct dquot *dquot, - qsize_t number) +static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number) { - WARN_ON(dquot->dq_dqb.dqb_rsvspace < number); + if (dquot->dq_dqb.dqb_rsvspace < number) { + WARN_ON_ONCE(1); + number = dquot->dq_dqb.dqb_rsvspace; + } dquot->dq_dqb.dqb_curspace += number; dquot->dq_dqb.dqb_rsvspace -= number; } @@ -989,7 +1034,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot, static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { - dquot->dq_dqb.dqb_rsvspace -= number; + if (dquot->dq_dqb.dqb_rsvspace >= number) + dquot->dq_dqb.dqb_rsvspace -= number; + else { + WARN_ON_ONCE(1); + dquot->dq_dqb.dqb_rsvspace = 0; + } } static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) @@ -1131,13 +1181,13 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return QUOTA_OK; + return 0; if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { *warntype = QUOTA_NL_IHARDWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1146,7 +1196,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) get_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { *warntype = QUOTA_NL_ISOFTLONGWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1157,7 +1207,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; } - return QUOTA_OK; + return 0; } /* needs dq_data_lock */ @@ -1169,7 +1219,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return QUOTA_OK; + return 0; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + space; @@ -1179,7 +1229,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = QUOTA_NL_BHARDWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1189,7 +1239,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = QUOTA_NL_BSOFTLONGWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1205,10 +1255,10 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war * We don't allow preallocation to exceed softlimit so exceeding will * be always printed */ - return NO_QUOTA; + return -EDQUOT; } - return QUOTA_OK; + return 0; } static int info_idq_free(struct dquot *dquot, qsize_t inodes) @@ -1242,25 +1292,32 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } + /* - * Initialize quota pointers in inode - * We do things in a bit complicated way but by that we avoid calling - * dqget() and thus filesystem callbacks under dqptr_sem. + * Initialize quota pointers in inode + * + * We do things in a bit complicated way but by that we avoid calling + * dqget() and thus filesystem callbacks under dqptr_sem. + * + * It is better to call this function outside of any transaction as it + * might need a lot of space in journal for dquot structure allocation. */ -int dquot_initialize(struct inode *inode, int type) +static void __dquot_initialize(struct inode *inode, int type) { unsigned int id = 0; - int cnt, ret = 0; - struct dquot *got[MAXQUOTAS] = { NULL, NULL }; + int cnt; + struct dquot *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; + qsize_t rsv; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return 0; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return; /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + got[cnt] = NULL; if (type != -1 && cnt != type) continue; switch (cnt) { @@ -1275,7 +1332,6 @@ int dquot_initialize(struct inode *inode, int type) } down_write(&sb_dqopt(sb)->dqptr_sem); - /* Having dqptr_sem we know NOQUOTA flags can't be altered... */ if (IS_NOQUOTA(inode)) goto out_err; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1287,20 +1343,31 @@ int dquot_initialize(struct inode *inode, int type) if (!inode->i_dquot[cnt]) { inode->i_dquot[cnt] = got[cnt]; got[cnt] = NULL; + /* + * Make quota reservation system happy if someone + * did a write before quota was turned on + */ + rsv = inode_get_rsv_space(inode); + if (unlikely(rsv)) + dquot_resv_space(inode->i_dquot[cnt], rsv); } } out_err: up_write(&sb_dqopt(sb)->dqptr_sem); /* Drop unused references */ dqput_all(got); - return ret; +} + +void dquot_initialize(struct inode *inode) +{ + __dquot_initialize(inode, -1); } EXPORT_SYMBOL(dquot_initialize); /* * Release all quotas referenced by inode */ -int dquot_drop(struct inode *inode) +static void __dquot_drop(struct inode *inode) { int cnt; struct dquot *put[MAXQUOTAS]; @@ -1312,32 +1379,31 @@ int dquot_drop(struct inode *inode) } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); dqput_all(put); - return 0; } -EXPORT_SYMBOL(dquot_drop); -/* Wrapper to remove references to quota structures from inode */ -void vfs_dq_drop(struct inode *inode) -{ - /* Here we can get arbitrary inode from clear_inode() so we have - * to be careful. OTOH we don't need locking as quota operations - * are allowed to change only at mount time */ - if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op - && inode->i_sb->dq_op->drop) { - int cnt; - /* Test before calling to rule out calls from proc and such - * where we are not allowed to block. Note that this is - * actually reliable test even without the lock - the caller - * must assure that nobody can come after the DQUOT_DROP and - * add quota pointers back anyway */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - break; - if (cnt < MAXQUOTAS) - inode->i_sb->dq_op->drop(inode); - } -} -EXPORT_SYMBOL(vfs_dq_drop); +void dquot_drop(struct inode *inode) +{ + int cnt; + + if (IS_NOQUOTA(inode)) + return; + + /* + * Test before calling to rule out calls from proc and such + * where we are not allowed to block. Note that this is + * actually reliable test even without the lock - the caller + * must assure that nobody can come after the DQUOT_DROP and + * add quota pointers back anyway. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt]) + break; + } + + if (cnt < MAXQUOTAS) + __dquot_drop(inode); +} +EXPORT_SYMBOL(dquot_drop); /* * inode_reserved_space is managed internally by quota, and protected by @@ -1351,28 +1417,30 @@ static qsize_t *inode_reserved_space(struct inode * inode) return inode->i_sb->dq_op->get_reserved_space(inode); } -static void inode_add_rsv_space(struct inode *inode, qsize_t number) +void inode_add_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_add_rsv_space); - -static void inode_claim_rsv_space(struct inode *inode, qsize_t number) +void inode_claim_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_claim_rsv_space); -static void inode_sub_rsv_space(struct inode *inode, qsize_t number) +void inode_sub_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_sub_rsv_space); static qsize_t inode_get_rsv_space(struct inode *inode) { @@ -1404,38 +1472,34 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) } /* - * Following four functions update i_blocks+i_bytes fields and - * quota information (together with appropriate checks) - * NOTE: We absolutely rely on the fact that caller dirties - * the inode (usually macros in quotaops.h care about this) and - * holds a handle for the current transaction so that dquot write and - * inode write go into the same transaction. + * This functions updates i_blocks+i_bytes fields and quota information + * (together with appropriate checks). + * + * NOTE: We absolutely rely on the fact that caller dirties the inode + * (usually helpers in quotaops.h care about this) and holds a handle for + * the current transaction so that dquot write and inode write go into the + * same transaction. */ /* * This operation can block, but only after everything is updated */ int __dquot_alloc_space(struct inode *inode, qsize_t number, - int warn, int reserve) + int warn, int reserve) { - int cnt, ret = QUOTA_OK; + int cnt, ret = 0; char warntype[MAXQUOTAS]; /* * First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_incr_space(inode, number, reserve); goto out; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - inode_incr_space(inode, number, reserve); - goto out_unlock; - } - for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1443,9 +1507,9 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) - == NO_QUOTA) { - ret = NO_QUOTA; + ret = check_bdq(inode->i_dquot[cnt], number, !warn, + warntype+cnt); + if (ret) { spin_unlock(&dq_data_lock); goto out_flush_warn; } @@ -1466,61 +1530,45 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, mark_all_dquot_dirty(inode->i_dquot); out_flush_warn: flush_warnings(inode->i_dquot, warntype); -out_unlock: up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); out: return ret; } - -int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) -{ - return __dquot_alloc_space(inode, number, warn, 0); -} -EXPORT_SYMBOL(dquot_alloc_space); - -int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) -{ - return __dquot_alloc_space(inode, number, warn, 1); -} -EXPORT_SYMBOL(dquot_reserve_space); +EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated */ -int dquot_alloc_inode(const struct inode *inode, qsize_t number) +int dquot_alloc_inode(const struct inode *inode) { - int cnt, ret = NO_QUOTA; + int cnt, ret = 0; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return QUOTA_OK; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) - == NO_QUOTA) + ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); + if (ret) goto warn_put_all; } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - dquot_incr_inodes(inode->i_dquot[cnt], number); + dquot_incr_inodes(inode->i_dquot[cnt], 1); } - ret = QUOTA_OK; + warn_put_all: spin_unlock(&dq_data_lock); - if (ret == QUOTA_OK) + if (ret == 0) mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1528,23 +1576,19 @@ warn_put_all: } EXPORT_SYMBOL(dquot_alloc_inode); -int dquot_claim_space(struct inode *inode, qsize_t number) +/* + * Convert in-memory reserved quotas to real consumed quotas + */ +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { int cnt; - int ret = QUOTA_OK; - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_claim_rsv_space(inode, number); - goto out; + return 0; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - inode_claim_rsv_space(inode, number); - goto out; - } - spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1557,33 +1601,26 @@ int dquot_claim_space(struct inode *inode, qsize_t number) spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return 0; } -EXPORT_SYMBOL(dquot_claim_space); +EXPORT_SYMBOL(dquot_claim_space_nodirty); /* * This operation can block, but only after everything is updated */ -int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) +void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) { unsigned int cnt; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) { -out_sub: + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_decr_space(inode, number, reserve); - return QUOTA_OK; + return; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - goto out_sub; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) @@ -1603,56 +1640,34 @@ out_sub: out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; -} - -int dquot_free_space(struct inode *inode, qsize_t number) -{ - return __dquot_free_space(inode, number, 0); } -EXPORT_SYMBOL(dquot_free_space); - -/* - * Release reserved quota space - */ -void dquot_release_reserved_space(struct inode *inode, qsize_t number) -{ - __dquot_free_space(inode, number, 1); - -} -EXPORT_SYMBOL(dquot_release_reserved_space); +EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated */ -int dquot_free_inode(const struct inode *inode, qsize_t number) +void dquot_free_inode(const struct inode *inode) { unsigned int cnt; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return QUOTA_OK; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); - dquot_decr_inodes(inode->i_dquot[cnt], number); + warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); + dquot_decr_inodes(inode->i_dquot[cnt], 1); } spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; } EXPORT_SYMBOL(dquot_free_inode); @@ -1662,37 +1677,31 @@ EXPORT_SYMBOL(dquot_free_inode); * This operation can block, but only after everything is updated * A transaction must be started when entering this function. */ -int dquot_transfer(struct inode *inode, struct iattr *iattr) +static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) { qsize_t space, cur_space; qsize_t rsv_space = 0; struct dquot *transfer_from[MAXQUOTAS]; struct dquot *transfer_to[MAXQUOTAS]; - int cnt, ret = QUOTA_OK; - int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid, - chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid; + int cnt, ret = 0; char warntype_to[MAXQUOTAS]; char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) - return QUOTA_OK; + return 0; /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { transfer_from[cnt] = NULL; transfer_to[cnt] = NULL; warntype_to[cnt] = QUOTA_NL_NOWARN; } - if (chuid) - transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid, - USRQUOTA); - if (chgid) - transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid, - GRPQUOTA); - + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (mask & (1 << cnt)) + transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt); + } down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); goto put_all; @@ -1706,9 +1715,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (!transfer_to[cnt]) continue; transfer_from[cnt] = inode->i_dquot[cnt]; - if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == - NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, - warntype_to + cnt) == NO_QUOTA) + ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); + if (ret) + goto over_quota; + ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt); + if (ret) goto over_quota; } @@ -1762,22 +1773,32 @@ over_quota: /* Clear dquot pointers we don't want to dqput() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) transfer_from[cnt] = NULL; - ret = NO_QUOTA; goto warn_put_all; } -EXPORT_SYMBOL(dquot_transfer); -/* Wrapper for transferring ownership of an inode */ -int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +/* Wrapper for transferring ownership of an inode for uid/gid only + * Called from FSXXX_setattr() + */ +int dquot_transfer(struct inode *inode, struct iattr *iattr) { + qid_t chid[MAXQUOTAS]; + unsigned long mask = 0; + + if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { + mask |= 1 << USRQUOTA; + chid[USRQUOTA] = iattr->ia_uid; + } + if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { + mask |= 1 << GRPQUOTA; + chid[GRPQUOTA] = iattr->ia_gid; + } if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { - vfs_dq_init(inode); - if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) - return 1; + dquot_initialize(inode); + return __dquot_transfer(inode, chid, mask); } return 0; } -EXPORT_SYMBOL(vfs_dq_transfer); +EXPORT_SYMBOL(dquot_transfer); /* * Write info of quota file to disk @@ -1798,13 +1819,6 @@ EXPORT_SYMBOL(dquot_commit_info); * Definitions of diskquota operations. */ const struct dquot_operations dquot_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, @@ -1815,6 +1829,20 @@ const struct dquot_operations dquot_operations = { }; /* + * Generic helper for ->open on filesystems supporting disk quotas. + */ +int dquot_file_open(struct inode *inode, struct file *file) +{ + int error; + + error = generic_file_open(inode, file); + if (!error && (file->f_mode & FMODE_WRITE)) + dquot_initialize(inode); + return error; +} +EXPORT_SYMBOL(dquot_file_open); + +/* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) @@ -1993,11 +2021,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { - /* As we bypass the pagecache we must now flush the inode so - * that we see all the changes from userspace... */ - write_inode_now(inode, 1); - /* And now flush the block cache so that kernel sees the - * changes */ + /* As we bypass the pagecache we must now flush all the + * dirty data and invalidate caches so that kernel sees + * changes from userspace. It is not enough to just flush + * the quota file since if blocksize < pagesize, invalidation + * of the cache could fail because of other unrelated dirty + * data */ + sync_filesystem(sb); invalidate_bdev(sb->s_bdev); } mutex_lock(&dqopt->dqonoff_mutex); @@ -2010,14 +2040,16 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ - down_write(&dqopt->dqptr_sem); mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; mutex_unlock(&inode->i_mutex); - up_write(&dqopt->dqptr_sem); - sb->dq_op->drop(inode); + /* + * When S_NOQUOTA is set, remove dquot references as no more + * references can be added + */ + __dquot_drop(inode); } error = -EIO; @@ -2053,14 +2085,12 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - down_write(&dqopt->dqptr_sem); mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; mutex_unlock(&inode->i_mutex); - up_write(&dqopt->dqptr_sem); } mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c new file mode 100644 index 00000000000..2663ed90fb0 --- /dev/null +++ b/fs/quota/netlink.c @@ -0,0 +1,95 @@ + +#include <linux/cred.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/quotaops.h> +#include <linux/sched.h> +#include <net/netlink.h> +#include <net/genetlink.h> + +/* Netlink family structure for quota */ +static struct genl_family quota_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "VFS_DQUOT", + .version = 1, + .maxattr = QUOTA_NL_A_MAX, +}; + +/** + * quota_send_warning - Send warning to userspace about exceeded quota + * @type: The quota type: USRQQUOTA, GRPQUOTA,... + * @id: The user or group id of the quota that was exceeded + * @dev: The device on which the fs is mounted (sb->s_dev) + * @warntype: The type of the warning: QUOTA_NL_... + * + * This can be used by filesystems (including those which don't use + * dquot) to send a message to userspace relating to quota limits. + * + */ + +void quota_send_warning(short type, unsigned int id, dev_t dev, + const char warntype) +{ + static atomic_t seq; + struct sk_buff *skb; + void *msg_head; + int ret; + int msg_size = 4 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u64)); + + /* We have to allocate using GFP_NOFS as we are called from a + * filesystem performing write and thus further recursion into + * the fs to free some data could cause deadlocks. */ + skb = genlmsg_new(msg_size, GFP_NOFS); + if (!skb) { + printk(KERN_ERR + "VFS: Not enough memory to send quota warning.\n"); + return; + } + msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), + "a_genl_family, 0, QUOTA_NL_C_WARNING); + if (!msg_head) { + printk(KERN_ERR + "VFS: Cannot store netlink header in quota warning.\n"); + goto err_out; + } + ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); + if (ret) + goto attr_err_out; + genlmsg_end(skb, msg_head); + + genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); + return; +attr_err_out: + printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); +err_out: + kfree_skb(skb); +} +EXPORT_SYMBOL(quota_send_warning); + +static int __init quota_init(void) +{ + if (genl_register_family("a_genl_family) != 0) + printk(KERN_ERR + "VFS: Failed to create quota netlink interface.\n"); + return 0; +}; + +module_init(quota_init); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ee91e275695..95388f9b735 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -10,7 +10,6 @@ #include <linux/slab.h> #include <asm/current.h> #include <asm/uaccess.h> -#include <linux/compat.h> #include <linux/kernel.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -18,220 +17,205 @@ #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/types.h> -#include <net/netlink.h> -#include <net/genetlink.h> +#include <linux/writeback.h> -/* Check validity of generic quotactl commands */ -static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) +static int check_quotactl_permission(struct super_block *sb, int type, int cmd, + qid_t id) { - if (type >= MAXQUOTAS) - return -EINVAL; - if (!sb && cmd != Q_SYNC) - return -ENODEV; - /* Is operation supported? */ - if (sb && !sb->s_qcop) - return -ENOSYS; - switch (cmd) { - case Q_GETFMT: - break; - case Q_QUOTAON: - if (!sb->s_qcop->quota_on) - return -ENOSYS; - break; - case Q_QUOTAOFF: - if (!sb->s_qcop->quota_off) - return -ENOSYS; - break; - case Q_SETINFO: - if (!sb->s_qcop->set_info) - return -ENOSYS; - break; - case Q_GETINFO: - if (!sb->s_qcop->get_info) - return -ENOSYS; - break; - case Q_SETQUOTA: - if (!sb->s_qcop->set_dqblk) - return -ENOSYS; - break; - case Q_GETQUOTA: - if (!sb->s_qcop->get_dqblk) - return -ENOSYS; - break; - case Q_SYNC: - if (sb && !sb->s_qcop->quota_sync) - return -ENOSYS; + /* these commands do not require any special privilegues */ + case Q_GETFMT: + case Q_SYNC: + case Q_GETINFO: + case Q_XGETQSTAT: + case Q_XQUOTASYNC: + break; + /* allow to query information for dquots we "own" */ + case Q_GETQUOTA: + case Q_XGETQUOTA: + if ((type == USRQUOTA && current_euid() == id) || + (type == GRPQUOTA && in_egroup_p(id))) break; - default: - return -EINVAL; + /*FALLTHROUGH*/ + default: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; } - /* Is quota turned on for commands which need it? */ - switch (cmd) { - case Q_GETFMT: - case Q_GETINFO: - case Q_SETINFO: - case Q_SETQUOTA: - case Q_GETQUOTA: - /* This is just an informative test so we are satisfied - * without the lock */ - if (!sb_has_quota_active(sb, type)) - return -ESRCH; - } + return security_quotactl(cmd, type, id, sb); +} - /* Check privileges */ - if (cmd == Q_GETQUOTA) { - if (((type == USRQUOTA && current_euid() != id) || - (type == GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; +static int quota_sync_all(int type) +{ + struct super_block *sb; + int ret; + + if (type >= MAXQUOTAS) + return -EINVAL; + ret = security_quotactl(Q_SYNC, type, 0, NULL); + if (ret) + return ret; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_qcop || !sb->s_qcop->quota_sync) + continue; + + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root) + sb->s_qcop->quota_sync(sb, type, 1); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; } - else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + spin_unlock(&sb_lock); return 0; } -/* Check validity of XFS Quota Manager commands */ -static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) +static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, + void __user *addr) { - if (type >= XQM_MAXQUOTAS) - return -EINVAL; - if (!sb) - return -ENODEV; - if (!sb->s_qcop) - return -ENOSYS; + char *pathname; + int ret = -ENOSYS; + + pathname = getname(addr); + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + if (sb->s_qcop->quota_on) + ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); + putname(pathname); + return ret; +} - switch (cmd) { - case Q_XQUOTAON: - case Q_XQUOTAOFF: - case Q_XQUOTARM: - if (!sb->s_qcop->set_xstate) - return -ENOSYS; - break; - case Q_XGETQSTAT: - if (!sb->s_qcop->get_xstate) - return -ENOSYS; - break; - case Q_XSETQLIM: - if (!sb->s_qcop->set_xquota) - return -ENOSYS; - break; - case Q_XGETQUOTA: - if (!sb->s_qcop->get_xquota) - return -ENOSYS; - break; - case Q_XQUOTASYNC: - if (!sb->s_qcop->quota_sync) - return -ENOSYS; - break; - default: - return -EINVAL; - } +static int quota_getfmt(struct super_block *sb, int type, void __user *addr) +{ + __u32 fmt; - /* Check privileges */ - if (cmd == Q_XGETQUOTA) { - if (((type == XQM_USRQUOTA && current_euid() != id) || - (type == XQM_GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + down_read(&sb_dqopt(sb)->dqptr_sem); + if (!sb_has_quota_active(sb, type)) { + up_read(&sb_dqopt(sb)->dqptr_sem); + return -ESRCH; } + fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; + up_read(&sb_dqopt(sb)->dqptr_sem); + if (copy_to_user(addr, &fmt, sizeof(fmt))) + return -EFAULT; + return 0; +} +static int quota_getinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + int ret; + + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->get_info) + return -ENOSYS; + ret = sb->s_qcop->get_info(sb, type, &info); + if (!ret && copy_to_user(addr, &info, sizeof(info))) + return -EFAULT; + return ret; +} + +static int quota_setinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + + if (copy_from_user(&info, addr, sizeof(info))) + return -EFAULT; + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->set_info) + return -ENOSYS; + return sb->s_qcop->set_info(sb, type, &info); +} + +static int quota_getquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct if_dqblk idq; + int ret; + + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); + if (ret) + return ret; + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; return 0; } -static int check_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) +static int quota_setquota(struct super_block *sb, int type, qid_t id, + void __user *addr) { - int error; - - if (XQM_COMMAND(cmd)) - error = xqm_quotactl_valid(sb, type, cmd, id); - else - error = generic_quotactl_valid(sb, type, cmd, id); - if (!error) - error = security_quotactl(cmd, type, id, sb); - return error; + struct if_dqblk idq; + + if (copy_from_user(&idq, addr, sizeof(idq))) + return -EFAULT; + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + return sb->s_qcop->set_dqblk(sb, type, id, &idq); } -#ifdef CONFIG_QUOTA -void sync_quota_sb(struct super_block *sb, int type) +static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) { - int cnt; + __u32 flags; - if (!sb->s_qcop->quota_sync) - return; + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->set_xstate) + return -ENOSYS; + return sb->s_qcop->set_xstate(sb, flags, cmd); +} - sb->s_qcop->quota_sync(sb, type); +static int quota_getxstate(struct super_block *sb, void __user *addr) +{ + struct fs_quota_stat fqs; + int ret; - if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) - return; - /* This is not very clever (and fast) but currently I don't know about - * any other simple way of getting quota data to disk and we must get - * them there for userspace to be visible... */ - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + if (!sb->s_qcop->get_xstate) + return -ENOSYS; + ret = sb->s_qcop->get_xstate(sb, &fqs); + if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + return -EFAULT; + return ret; +} - /* - * Now when everything is written we can discard the pagecache so - * that userspace sees the changes. - */ - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (type != -1 && cnt != type) - continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, - I_MUTEX_QUOTA); - truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); - mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); - } - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); +static int quota_setxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + + if (copy_from_user(&fdq, addr, sizeof(fdq))) + return -EFAULT; + if (!sb->s_qcop->set_xquota) + return -ENOSYS; + return sb->s_qcop->set_xquota(sb, type, id, &fdq); } -#endif -static void sync_dquots(int type) +static int quota_getxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) { - struct super_block *sb; - int cnt; + struct fs_disk_quota fdq; + int ret; - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - /* This test just improves performance so it needn't be - * reliable... */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (type != -1 && type != cnt) - continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && - list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) - continue; - break; - } - if (cnt == MAXQUOTAS) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - sync_quota_sb(sb, type); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); + if (!sb->s_qcop->get_xquota) + return -ENOSYS; + ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); + if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) + return -EFAULT; + return ret; } /* Copy parameters and call proper function */ @@ -240,117 +224,55 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, { int ret; + if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS)) + return -EINVAL; + if (!sb->s_qcop) + return -ENOSYS; + + ret = check_quotactl_permission(sb, type, cmd, id); + if (ret < 0) + return ret; + switch (cmd) { - case Q_QUOTAON: { - char *pathname; - - pathname = getname(addr); - if (IS_ERR(pathname)) - return PTR_ERR(pathname); - ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); - putname(pathname); - return ret; - } - case Q_QUOTAOFF: - return sb->s_qcop->quota_off(sb, type, 0); - - case Q_GETFMT: { - __u32 fmt; - - down_read(&sb_dqopt(sb)->dqptr_sem); - if (!sb_has_quota_active(sb, type)) { - up_read(&sb_dqopt(sb)->dqptr_sem); - return -ESRCH; - } - fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; - up_read(&sb_dqopt(sb)->dqptr_sem); - if (copy_to_user(addr, &fmt, sizeof(fmt))) - return -EFAULT; - return 0; - } - case Q_GETINFO: { - struct if_dqinfo info; - - ret = sb->s_qcop->get_info(sb, type, &info); - if (ret) - return ret; - if (copy_to_user(addr, &info, sizeof(info))) - return -EFAULT; - return 0; - } - case Q_SETINFO: { - struct if_dqinfo info; - - if (copy_from_user(&info, addr, sizeof(info))) - return -EFAULT; - return sb->s_qcop->set_info(sb, type, &info); - } - case Q_GETQUOTA: { - struct if_dqblk idq; - - ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); - if (ret) - return ret; - if (copy_to_user(addr, &idq, sizeof(idq))) - return -EFAULT; - return 0; - } - case Q_SETQUOTA: { - struct if_dqblk idq; - - if (copy_from_user(&idq, addr, sizeof(idq))) - return -EFAULT; - return sb->s_qcop->set_dqblk(sb, type, id, &idq); - } - case Q_SYNC: - if (sb) - sync_quota_sb(sb, type); - else - sync_dquots(type); - return 0; - - case Q_XQUOTAON: - case Q_XQUOTAOFF: - case Q_XQUOTARM: { - __u32 flags; - - if (copy_from_user(&flags, addr, sizeof(flags))) - return -EFAULT; - return sb->s_qcop->set_xstate(sb, flags, cmd); - } - case Q_XGETQSTAT: { - struct fs_quota_stat fqs; - - if ((ret = sb->s_qcop->get_xstate(sb, &fqs))) - return ret; - if (copy_to_user(addr, &fqs, sizeof(fqs))) - return -EFAULT; - return 0; - } - case Q_XSETQLIM: { - struct fs_disk_quota fdq; - - if (copy_from_user(&fdq, addr, sizeof(fdq))) - return -EFAULT; - return sb->s_qcop->set_xquota(sb, type, id, &fdq); - } - case Q_XGETQUOTA: { - struct fs_disk_quota fdq; - - ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); - if (ret) - return ret; - if (copy_to_user(addr, &fdq, sizeof(fdq))) - return -EFAULT; - return 0; - } - case Q_XQUOTASYNC: - return sb->s_qcop->quota_sync(sb, type); - /* We never reach here unless validity check is broken */ - default: - BUG(); + case Q_QUOTAON: + return quota_quotaon(sb, type, cmd, id, addr); + case Q_QUOTAOFF: + if (!sb->s_qcop->quota_off) + return -ENOSYS; + return sb->s_qcop->quota_off(sb, type, 0); + case Q_GETFMT: + return quota_getfmt(sb, type, addr); + case Q_GETINFO: + return quota_getinfo(sb, type, addr); + case Q_SETINFO: + return quota_setinfo(sb, type, addr); + case Q_GETQUOTA: + return quota_getquota(sb, type, id, addr); + case Q_SETQUOTA: + return quota_setquota(sb, type, id, addr); + case Q_SYNC: + if (!sb->s_qcop->quota_sync) + return -ENOSYS; + return sb->s_qcop->quota_sync(sb, type, 1); + case Q_XQUOTAON: + case Q_XQUOTAOFF: + case Q_XQUOTARM: + return quota_setxstate(sb, cmd, addr); + case Q_XGETQSTAT: + return quota_getxstate(sb, addr); + case Q_XSETQLIM: + return quota_setxquota(sb, type, id, addr); + case Q_XGETQUOTA: + return quota_getxquota(sb, type, id, addr); + case Q_XQUOTASYNC: + /* caller already holds s_umount */ + if (sb->s_flags & MS_RDONLY) + return -EROFS; + writeback_inodes_sb(sb); + return 0; + default: + return -EINVAL; } - return 0; } /* @@ -397,224 +319,23 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; - if (cmds != Q_SYNC || special) { - sb = quotactl_block(special); - if (IS_ERR(sb)) - return PTR_ERR(sb); + /* + * As a special case Q_SYNC can be called without a specific device. + * It will iterate all superblocks that have quota enabled and call + * the sync action on each of them. + */ + if (!special) { + if (cmds == Q_SYNC) + return quota_sync_all(type); + return -ENODEV; } - ret = check_quotactl_valid(sb, type, cmds, id); - if (ret >= 0) - ret = do_quotactl(sb, type, cmds, id, addr); - if (sb) - drop_super(sb); + sb = quotactl_block(special); + if (IS_ERR(sb)) + return PTR_ERR(sb); - return ret; -} - -#if defined(CONFIG_COMPAT_FOR_U64_ALIGNMENT) -/* - * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) - * and is necessary due to alignment problems. - */ -struct compat_if_dqblk { - compat_u64 dqb_bhardlimit; - compat_u64 dqb_bsoftlimit; - compat_u64 dqb_curspace; - compat_u64 dqb_ihardlimit; - compat_u64 dqb_isoftlimit; - compat_u64 dqb_curinodes; - compat_u64 dqb_btime; - compat_u64 dqb_itime; - compat_uint_t dqb_valid; -}; - -/* XFS structures */ -struct compat_fs_qfilestat { - compat_u64 dqb_bhardlimit; - compat_u64 qfs_nblks; - compat_uint_t qfs_nextents; -}; - -struct compat_fs_quota_stat { - __s8 qs_version; - __u16 qs_flags; - __s8 qs_pad; - struct compat_fs_qfilestat qs_uquota; - struct compat_fs_qfilestat qs_gquota; - compat_uint_t qs_incoredqs; - compat_int_t qs_btimelimit; - compat_int_t qs_itimelimit; - compat_int_t qs_rtbtimelimit; - __u16 qs_bwarnlimit; - __u16 qs_iwarnlimit; -}; - -asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, - qid_t id, void __user *addr) -{ - unsigned int cmds; - struct if_dqblk __user *dqblk; - struct compat_if_dqblk __user *compat_dqblk; - struct fs_quota_stat __user *fsqstat; - struct compat_fs_quota_stat __user *compat_fsqstat; - compat_uint_t data; - u16 xdata; - long ret; + ret = do_quotactl(sb, type, cmds, id, addr); - cmds = cmd >> SUBCMDSHIFT; - - switch (cmds) { - case Q_GETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = sys_quotactl(cmd, special, id, dqblk); - if (ret) - break; - if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || - get_user(data, &dqblk->dqb_valid) || - put_user(data, &compat_dqblk->dqb_valid)) - ret = -EFAULT; - break; - case Q_SETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = -EFAULT; - if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || - get_user(data, &compat_dqblk->dqb_valid) || - put_user(data, &dqblk->dqb_valid)) - break; - ret = sys_quotactl(cmd, special, id, dqblk); - break; - case Q_XGETQSTAT: - fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); - compat_fsqstat = addr; - ret = sys_quotactl(cmd, special, id, fsqstat); - if (ret) - break; - ret = -EFAULT; - /* Copying qs_version, qs_flags, qs_pad */ - if (copy_in_user(compat_fsqstat, fsqstat, - offsetof(struct compat_fs_quota_stat, qs_uquota))) - break; - /* Copying qs_uquota */ - if (copy_in_user(&compat_fsqstat->qs_uquota, - &fsqstat->qs_uquota, - sizeof(compat_fsqstat->qs_uquota)) || - get_user(data, &fsqstat->qs_uquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) - break; - /* Copying qs_gquota */ - if (copy_in_user(&compat_fsqstat->qs_gquota, - &fsqstat->qs_gquota, - sizeof(compat_fsqstat->qs_gquota)) || - get_user(data, &fsqstat->qs_gquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) - break; - /* Copying the rest */ - if (copy_in_user(&compat_fsqstat->qs_incoredqs, - &fsqstat->qs_incoredqs, - sizeof(struct compat_fs_quota_stat) - - offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || - get_user(xdata, &fsqstat->qs_iwarnlimit) || - put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) - break; - ret = 0; - break; - default: - ret = sys_quotactl(cmd, special, id, addr); - } + drop_super(sb); return ret; } -#endif - - -#ifdef CONFIG_QUOTA_NETLINK_INTERFACE - -/* Netlink family structure for quota */ -static struct genl_family quota_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = "VFS_DQUOT", - .version = 1, - .maxattr = QUOTA_NL_A_MAX, -}; - -/** - * quota_send_warning - Send warning to userspace about exceeded quota - * @type: The quota type: USRQQUOTA, GRPQUOTA,... - * @id: The user or group id of the quota that was exceeded - * @dev: The device on which the fs is mounted (sb->s_dev) - * @warntype: The type of the warning: QUOTA_NL_... - * - * This can be used by filesystems (including those which don't use - * dquot) to send a message to userspace relating to quota limits. - * - */ - -void quota_send_warning(short type, unsigned int id, dev_t dev, - const char warntype) -{ - static atomic_t seq; - struct sk_buff *skb; - void *msg_head; - int ret; - int msg_size = 4 * nla_total_size(sizeof(u32)) + - 2 * nla_total_size(sizeof(u64)); - - /* We have to allocate using GFP_NOFS as we are called from a - * filesystem performing write and thus further recursion into - * the fs to free some data could cause deadlocks. */ - skb = genlmsg_new(msg_size, GFP_NOFS); - if (!skb) { - printk(KERN_ERR - "VFS: Not enough memory to send quota warning.\n"); - return; - } - msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), - "a_genl_family, 0, QUOTA_NL_C_WARNING); - if (!msg_head) { - printk(KERN_ERR - "VFS: Cannot store netlink header in quota warning.\n"); - goto err_out; - } - ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); - if (ret) - goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); - if (ret) - goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); - if (ret) - goto attr_err_out; - genlmsg_end(skb, msg_head); - - genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); - return; -attr_err_out: - printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); -err_out: - kfree_skb(skb); -} -EXPORT_SYMBOL(quota_send_warning); - -static int __init quota_init(void) -{ - if (genl_register_family("a_genl_family) != 0) - printk(KERN_ERR - "VFS: Failed to create quota netlink interface.\n"); - return 0; -}; - -module_init(quota_init); -#endif - diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 65c87276117..dc014f7def0 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, journal_mark_dirty(th, s, sbh); if (for_unformatted) - vfs_dq_free_block_nodirty(inode, 1); + dquot_free_block_nodirty(inode, 1); } void reiserfs_free_block(struct reiserfs_transaction_handle *th, @@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start amount_needed, hint->inode->i_uid); #endif quota_ret = - vfs_dq_alloc_block_nodirty(hint->inode, amount_needed); + dquot_alloc_block_nodirty(hint->inode, amount_needed); if (quota_ret) /* Quota exceeded? */ return QUOTA_EXCEEDED; if (hint->preallocate && hint->prealloc_size) { @@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid); #endif - quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode, + quota_ret = dquot_prealloc_block_nodirty(hint->inode, hint->prealloc_size); if (quota_ret) hint->preallocate = hint->prealloc_size = 0; @@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start hint->inode->i_uid); #endif /* Free not allocated blocks */ - vfs_dq_free_block_nodirty(hint->inode, + dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); } @@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid); #endif - vfs_dq_free_block_nodirty(hint->inode, amount_needed + + dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)-> i_prealloc_count); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index da2dba082e2..1d9c12714c5 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -289,7 +289,7 @@ const struct file_operations reiserfs_file_operations = { .compat_ioctl = reiserfs_compat_ioctl, #endif .mmap = reiserfs_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = reiserfs_file_release, .fsync = reiserfs_sync_file, .aio_read = generic_file_aio_read, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0d651f980a8..d1da94b82d8 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -34,6 +34,9 @@ void reiserfs_delete_inode(struct inode *inode) int depth; int err; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); depth = reiserfs_write_lock_once(inode->i_sb); @@ -54,7 +57,7 @@ void reiserfs_delete_inode(struct inode *inode) * after delete_object so that quota updates go into the same transaction as * stat data deletion */ if (!err) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (journal_end(&th, inode->i_sb, jbegin_count)) goto out; @@ -1765,10 +1768,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto out_end_trans; - } if (!dir->i_nlink) { err = -EPERM; goto out_bad_inode; @@ -1959,12 +1962,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, INODE_PKEY(inode)->k_objectid = 0; /* Quota change must be inside a transaction for journaling */ - vfs_dq_free_inode(inode); + dquot_free_inode(inode); out_end_trans: journal_end(th, th->t_super, th->t_blocks_allocated); /* Drop can be outside and it needs more credits so it's better to have it outside */ - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); @@ -3073,6 +3076,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { + dquot_initialize(inode); + /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate */ @@ -3134,8 +3139,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) jbegin_count); if (error) goto out; - error = - vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { journal_end(&th, inode->i_sb, jbegin_count); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 9d4dcf0b07c..96e4cbbfaa1 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, */ static int drop_new_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); make_bad_inode(inode); inode->i_flags |= S_NOQUOTA; iput(inode); @@ -554,7 +554,7 @@ static int drop_new_inode(struct inode *inode) } /* utility function that does setup for reiserfs_new_inode. -** vfs_dq_init needs lots of credits so it's better to have it +** dquot_initialize needs lots of credits so it's better to have it ** outside of a transaction, so we had to pull some bits of ** reiserfs_new_inode out into this func. */ @@ -577,7 +577,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode) } else { inode->i_gid = current_fsgid(); } - vfs_dq_init(inode); + dquot_initialize(inode); return 0; } @@ -594,6 +594,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; + dquot_initialize(dir); + if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } @@ -666,6 +668,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } @@ -739,6 +743,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + dquot_initialize(dir); + #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ REISERFS_I(dir)->new_packing_locality = 1; @@ -842,6 +848,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + dquot_initialize(dir); + reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) @@ -923,6 +931,8 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) unsigned long savelink; int depth; + dquot_initialize(dir); + inode = dentry->d_inode; /* in this transaction we can be doing at max two balancings and update @@ -1024,6 +1034,8 @@ static int reiserfs_symlink(struct inode *parent_dir, 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); + dquot_initialize(parent_dir); + if (!(inode = new_inode(parent_dir->i_sb))) { return -ENOMEM; } @@ -1111,6 +1123,8 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + dquot_initialize(dir); + reiserfs_write_lock(dir->i_sb); if (inode->i_nlink >= REISERFS_LINK_MAX) { //FIXME: sd_nlink is 32 bit for new files @@ -1235,6 +1249,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 5fa7118f04e..313d39d639e 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1299,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, inode->i_uid, head2type(&s_ih)); #endif - vfs_dq_free_space_nodirty(inode, quota_cut_bytes); + dquot_free_space_nodirty(inode, quota_cut_bytes); /* Return deleted body length */ return ret_value; @@ -1383,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, quota_cut_bytes, inode->i_uid, key2type(key)); #endif - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, quota_cut_bytes); } break; @@ -1733,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, '?'); #endif - vfs_dq_free_space_nodirty(inode, quota_cut_bytes); + dquot_free_space_nodirty(inode, quota_cut_bytes); return ret_value; } @@ -1968,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree key2type(&(key->on_disk_key))); #endif - if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) { + retval = dquot_alloc_space_nodirty(inode, pasted_size); + if (retval) { pathrelse(search_path); - return -EDQUOT; + return retval; } init_tb_struct(th, &s_paste_balance, th->t_super, search_path, pasted_size); @@ -2024,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree pasted_size, inode->i_uid, key2type(&(key->on_disk_key))); #endif - vfs_dq_free_space_nodirty(inode, pasted_size); + dquot_free_space_nodirty(inode, pasted_size); return retval; } @@ -2062,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, #endif /* We can't dirty inode here. It would be immediately written but * appropriate stat item isn't inserted yet... */ - if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) { + retval = dquot_alloc_space_nodirty(inode, quota_bytes); + if (retval) { pathrelse(path); - return -EDQUOT; + return retval; } } init_tb_struct(th, &s_ins_balance, th->t_super, path, @@ -2113,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, quota_bytes, inode->i_uid, head2type(ih)); #endif if (inode) - vfs_dq_free_space_nodirty(inode, quota_bytes); + dquot_free_space_nodirty(inode, quota_bytes); return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b4a7dd03bdb..04bf5d791bd 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -246,7 +246,7 @@ static int finish_unfinished(struct super_block *s) retval = remove_save_link_only(s, &save_link_key, 0); continue; } - vfs_dq_init(inode); + dquot_initialize(inode); if (truncate && S_ISDIR(inode->i_mode)) { /* We got a truncate request for a dir which is impossible. @@ -578,6 +578,11 @@ out: reiserfs_write_unlock_once(inode->i_sb, lock_depth); } +static void reiserfs_clear_inode(struct inode *inode) +{ + dquot_drop(inode); +} + #ifdef CONFIG_QUOTA static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t); @@ -590,6 +595,7 @@ static const struct super_operations reiserfs_sops = { .destroy_inode = reiserfs_destroy_inode, .write_inode = reiserfs_write_inode, .dirty_inode = reiserfs_dirty_inode, + .clear_inode = reiserfs_clear_inode, .delete_inode = reiserfs_delete_inode, .put_super = reiserfs_put_super, .write_super = reiserfs_write_super, @@ -616,13 +622,6 @@ static int reiserfs_write_info(struct super_block *, int); static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = reiserfs_write_dquot, .acquire_dquot = reiserfs_acquire_dquot, .release_dquot = reiserfs_release_dquot, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 81f09fab8ae..37d034ca7d9 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -61,7 +61,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); return dir->i_op->create(dir, dentry, mode, NULL); } #endif @@ -69,7 +68,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); return dir->i_op->mkdir(dir, dentry, mode); } @@ -81,7 +79,6 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, I_MUTEX_CHILD, dir->i_sb); @@ -97,7 +94,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, I_MUTEX_CHILD, dir->i_sb); diff --git a/fs/sync.c b/fs/sync.c index 418727a2a23..f557d71cb09 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -34,14 +34,14 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (!sb->s_bdi) return 0; - /* Avoid doing twice syncing and cache pruning for quota sync */ - if (!wait) { - writeout_quota_sb(sb, -1); - writeback_inodes_sb(sb); - } else { - sync_quota_sb(sb, -1); + if (sb->s_qcop && sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, -1, wait); + + if (wait) sync_inodes_sb(sb); - } + else + writeback_inodes_sb(sb); + if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); return __sync_blockdev(sb->s_bdev, wait); diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index b2d96f45c12..ccc3ad7242d 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -208,7 +208,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, ((char *)bh->b_data)[(bit + i) >> 3]); } else { if (inode) - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); udf_add_free_space(sb, sbi->s_partition, 1); } } @@ -260,11 +260,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, while (bit < (sb->s_blocksize << 3) && block_count > 0) { if (!udf_test_bit(bit, bh->b_data)) goto out; - else if (vfs_dq_prealloc_block(inode, 1)) + else if (dquot_prealloc_block(inode, 1)) goto out; else if (!udf_clear_bit(bit, bh->b_data)) { udf_debug("bit already cleared for block %d\n", bit); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto out; } block_count--; @@ -390,10 +390,14 @@ got_block: /* * Check quota for allocation of this block. */ - if (inode && vfs_dq_alloc_block(inode, 1)) { - mutex_unlock(&sbi->s_alloc_mutex); - *err = -EDQUOT; - return 0; + if (inode) { + int ret = dquot_alloc_block(inode, 1); + + if (ret) { + mutex_unlock(&sbi->s_alloc_mutex); + *err = ret; + return 0; + } } newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - @@ -449,7 +453,7 @@ static void udf_table_free_blocks(struct super_block *sb, /* We do this up front - There are some error conditions that could occure, but.. oh well */ if (inode) - vfs_dq_free_block(inode, count); + dquot_free_block(inode, count); udf_add_free_space(sb, sbi->s_partition, count); start = bloc->logicalBlockNum + offset; @@ -694,7 +698,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, epos.offset -= adsize; alloc_count = (elen >> sb->s_blocksize_bits); - if (inode && vfs_dq_prealloc_block(inode, + if (inode && dquot_prealloc_block(inode, alloc_count > block_count ? block_count : alloc_count)) alloc_count = 0; else if (alloc_count > block_count) { @@ -797,12 +801,13 @@ static int udf_table_new_block(struct super_block *sb, newblock = goal_eloc.logicalBlockNum; goal_eloc.logicalBlockNum++; goal_elen -= sb->s_blocksize; - - if (inode && vfs_dq_alloc_block(inode, 1)) { - brelse(goal_epos.bh); - mutex_unlock(&sbi->s_alloc_mutex); - *err = -EDQUOT; - return 0; + if (inode) { + *err = dquot_alloc_block(inode, 1); + if (*err) { + brelse(goal_epos.bh); + mutex_unlock(&sbi->s_alloc_mutex); + return 0; + } } if (goal_elen) diff --git a/fs/udf/file.c b/fs/udf/file.c index f311d509b6a..1eb06774ed9 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -34,6 +34,7 @@ #include <linux/errno.h> #include <linux/smp_lock.h> #include <linux/pagemap.h> +#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/aio.h> @@ -207,7 +208,7 @@ const struct file_operations udf_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .ioctl = udf_ioctl, - .open = generic_file_open, + .open = dquot_file_open, .mmap = generic_file_mmap, .write = do_sync_write, .aio_write = udf_file_aio_write, @@ -217,6 +218,29 @@ const struct file_operations udf_file_operations = { .llseek = generic_file_llseek, }; +static int udf_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (iattr->ia_valid & ATTR_SIZE) + dquot_initialize(inode); + + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + error = dquot_transfer(inode, iattr); + if (error) + return error; + } + + return inode_setattr(inode, iattr); +} + const struct inode_operations udf_file_inode_operations = { - .truncate = udf_truncate, + .truncate = udf_truncate, + .setattr = udf_setattr, }; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index c10fa39f97e..fb68c9cd0c3 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); clear_inode(inode); @@ -61,7 +61,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct inode *inode; - int block; + int block, ret; uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); @@ -153,12 +153,14 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) insert_inode_hash(inode); mark_inode_dirty(inode); - if (vfs_dq_alloc_inode(inode)) { - vfs_dq_drop(inode); + dquot_initialize(inode); + ret = dquot_alloc_inode(inode); + if (ret) { + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); - *err = -EDQUOT; + *err = ret; return NULL; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index b0208924729..b57ab0402d8 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -36,6 +36,7 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> @@ -70,6 +71,9 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); void udf_delete_inode(struct inode *inode) { + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -108,6 +112,8 @@ void udf_clear_inode(struct inode *inode) (unsigned long long)inode->i_size, (unsigned long long)iinfo->i_lenExtents); } + + dquot_drop(inode); kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 7c56ff00cd5..db423ab078b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -563,6 +563,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, int err; struct udf_inode_info *iinfo; + dquot_initialize(dir); + lock_kernel(); inode = udf_new_inode(dir, mode, &err); if (!inode) { @@ -616,6 +618,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!old_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + lock_kernel(); err = -EIO; inode = udf_new_inode(dir, mode, &err); @@ -662,6 +666,8 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; + dquot_initialize(dir); + lock_kernel(); err = -EMLINK; if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) @@ -799,6 +805,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; + dquot_initialize(dir); + retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -845,6 +853,8 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) struct fileIdentDesc cfi; struct kernel_lb_addr tloc; + dquot_initialize(dir); + retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -899,6 +909,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct buffer_head *bh; struct udf_inode_info *iinfo; + dquot_initialize(dir); + lock_kernel(); inode = udf_new_inode(dir, S_IFLNK, &err); if (!inode) @@ -1069,6 +1081,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, int err; struct buffer_head *bh; + dquot_initialize(dir); + lock_kernel(); if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { unlock_kernel(); @@ -1131,6 +1145,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + lock_kernel(); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 54c16ec95df..5cfa4d85ccf 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) "bit already cleared for fragment %u", i); } - vfs_dq_free_block(inode, count); + dquot_free_block(inode, count); fs32_add(sb, &ucg->cg_cs.cs_nffree, count); @@ -195,7 +195,7 @@ do_more: ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); - vfs_dq_free_block(inode, uspi->s_fpb); + dquot_free_block(inode, uspi->s_fpb); fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree++; @@ -511,6 +511,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; + int ret; UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", (unsigned long long)fragment, oldcount, newcount); @@ -556,8 +557,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); for (i = oldcount; i < newcount; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); - if (vfs_dq_alloc_block(inode, count)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, count); + if (ret) { + *err = ret; return 0; } @@ -596,6 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, allocsize; u64 result; + int ret; UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", inode->i_ino, cgno, (unsigned long long)goal, count); @@ -664,7 +667,7 @@ cg_found: for (i = count; i < uspi->s_fpb; i++) ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; - vfs_dq_free_block(inode, i); + dquot_free_block(inode, i); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; @@ -676,8 +679,9 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; - if (vfs_dq_alloc_block(inode, count)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, count); + if (ret) { + *err = ret; return 0; } for (i = 0; i < count; i++) @@ -714,6 +718,7 @@ static u64 ufs_alloccg_block(struct inode *inode, struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; u64 result, blkno; + int ret; UFSD("ENTER, goal %llu\n", (unsigned long long)goal); @@ -747,8 +752,9 @@ gotit: ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, -1); - if (vfs_dq_alloc_block(inode, uspi->s_fpb)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, uspi->s_fpb); + if (ret) { + *err = ret; return INVBLOCK; } diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 73655c61240..a8962cecde5 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -24,6 +24,7 @@ */ #include <linux/fs.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -40,7 +41,7 @@ const struct file_operations ufs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .fsync = simple_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 3527c00fef0..230ecf60802 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode) is_directory = S_ISDIR(inode->i_mode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); clear_inode (inode); @@ -355,9 +355,10 @@ cg_found: unlock_super (sb); - if (vfs_dq_alloc_inode(inode)) { - vfs_dq_drop(inode); - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) { + dquot_drop(inode); goto fail_without_unlock; } diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 0a627e08610..80b68c3702d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -37,6 +37,7 @@ #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -909,6 +910,9 @@ void ufs_delete_inode (struct inode * inode) { loff_t old_i_size; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) goto no_delete; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 4c26d9e8bc9..118556243e7 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -30,6 +30,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/smp_lock.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -84,6 +85,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, int err; UFSD("BEGIN\n"); + + dquot_initialize(dir); + inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); @@ -107,6 +111,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t if (!old_valid_dev(rdev)) return -EINVAL; + + dquot_initialize(dir); + inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -131,6 +138,8 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; + dquot_initialize(dir); + lock_kernel(); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); @@ -176,6 +185,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return -EMLINK; } + dquot_initialize(dir); + inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); atomic_inc(&inode->i_count); @@ -193,6 +204,8 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= UFS_LINK_MAX) goto out; + dquot_initialize(dir); + lock_kernel(); inode_inc_link_count(dir); @@ -237,6 +250,8 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; + dquot_initialize(dir); + de = ufs_find_entry(dir, &dentry->d_name, &page); if (!de) goto out; @@ -281,6 +296,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 143c20bfb04..66b63a75161 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1432,6 +1432,11 @@ static void destroy_inodecache(void) kmem_cache_destroy(ufs_inode_cachep); } +static void ufs_clear_inode(struct inode *inode) +{ + dquot_drop(inode); +} + #ifdef CONFIG_QUOTA static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t); static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t); @@ -1442,6 +1447,7 @@ static const struct super_operations ufs_super_ops = { .destroy_inode = ufs_destroy_inode, .write_inode = ufs_write_inode, .delete_inode = ufs_delete_inode, + .clear_inode = ufs_clear_inode, .put_super = ufs_put_super, .write_super = ufs_write_super, .sync_fs = ufs_sync_fs, diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 41dd431ce22..d3b6270cb37 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -44,6 +44,7 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/sched.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -517,9 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + error = dquot_transfer(inode, attr); + if (error) + return error; + } if (ia_valid & ATTR_SIZE && attr->ia_size != i_size_read(inode)) { loff_t old_i_size = inode->i_size; + + dquot_initialize(inode); + error = vmtruncate(inode, attr->ia_size); if (error) return error; diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 3d4a0c84d63..1947514ce1a 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -44,20 +44,6 @@ xfs_quota_type(int type) } STATIC int -xfs_fs_quota_sync( - struct super_block *sb, - int type) -{ - struct xfs_mount *mp = XFS_M(sb); - - if (sb->s_flags & MS_RDONLY) - return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; - return -xfs_sync_data(mp, 0); -} - -STATIC int xfs_fs_get_xstate( struct super_block *sb, struct fs_quota_stat *fqs) @@ -82,8 +68,6 @@ xfs_fs_set_xstate( return -EROFS; if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (uflags & XFS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; @@ -144,14 +128,11 @@ xfs_fs_set_xquota( return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); } const struct quotactl_ops xfs_quotactl_operations = { - .quota_sync = xfs_fs_quota_sync, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, .get_xquota = xfs_fs_get_xquota, diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index deac2566450..cac84b00666 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -202,14 +202,6 @@ static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) return flags & EXT3_OTHER_FLMASK; } -/* - * Inode dynamic state flags - */ -#define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ -#define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ -#define EXT3_STATE_XATTR 0x00000004 /* has in-inode xattrs */ -#define EXT3_STATE_FLUSH_ON_CLOSE 0x00000008 - /* Used to pass group descriptor data when online resize is done */ struct ext3_new_group_input { __u32 group; /* Group number for this data */ @@ -560,6 +552,31 @@ static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT3_FIRST_INO(sb) && ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)); } + +/* + * Inode dynamic state flags + */ +enum { + EXT3_STATE_JDATA, /* journaled data exists */ + EXT3_STATE_NEW, /* inode is newly created */ + EXT3_STATE_XATTR, /* has in-inode xattrs */ + EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */ +}; + +static inline int ext3_test_inode_state(struct inode *inode, int bit) +{ + return test_bit(bit, &EXT3_I(inode)->i_state); +} + +static inline void ext3_set_inode_state(struct inode *inode, int bit) +{ + set_bit(bit, &EXT3_I(inode)->i_state); +} + +static inline void ext3_clear_inode_state(struct inode *inode, int bit) +{ + clear_bit(bit, &EXT3_I(inode)->i_state); +} #else /* Assume that user mode programs are passing in an ext3fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h index 93e7428156b..7679acdb519 100644 --- a/include/linux/ext3_fs_i.h +++ b/include/linux/ext3_fs_i.h @@ -87,7 +87,7 @@ struct ext3_inode_info { * near to their parent directory's inode. */ __u32 i_block_group; - __u32 i_state; /* Dynamic state flags for ext3 */ + unsigned long i_state; /* Dynamic state flags for ext3 */ /* block reservation info */ struct ext3_block_alloc_info *i_block_alloc_info; diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 331530cd3cc..f3aa59cb675 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -246,19 +246,8 @@ typedef struct journal_superblock_s #define J_ASSERT(assert) BUG_ON(!(assert)) -#if defined(CONFIG_BUFFER_DEBUG) -void buffer_assertion_failure(struct buffer_head *bh); -#define J_ASSERT_BH(bh, expr) \ - do { \ - if (!(expr)) \ - buffer_assertion_failure(bh); \ - J_ASSERT(expr); \ - } while (0) -#define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr) -#else #define J_ASSERT_BH(bh, expr) J_ASSERT(expr) #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) -#endif #if defined(JBD_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 8ada2a129d0..1ec87635818 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -277,19 +277,8 @@ typedef struct journal_superblock_s #define J_ASSERT(assert) BUG_ON(!(assert)) -#if defined(CONFIG_BUFFER_DEBUG) -void buffer_assertion_failure(struct buffer_head *bh); -#define J_ASSERT_BH(bh, expr) \ - do { \ - if (!(expr)) \ - buffer_assertion_failure(bh); \ - J_ASSERT(expr); \ - } while (0) -#define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr) -#else #define J_ASSERT_BH(bh, expr) J_ASSERT(expr) #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) -#endif #if defined(JBD2_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index a24de0b1858..60df9c84eca 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -103,7 +103,7 @@ struct kvm_userspace_memory_region { /* for kvm_memory_region::flags */ #define KVM_MEM_LOG_DIRTY_PAGES 1UL - +#define KVM_MEMSLOT_INVALID (1UL << 1) /* for KVM_IRQ_LINE */ struct kvm_irq_level { @@ -497,6 +497,11 @@ struct kvm_ioeventfd { #endif #define KVM_CAP_S390_PSW 42 #define KVM_CAP_PPC_SEGSTATE 43 +#define KVM_CAP_HYPERV 44 +#define KVM_CAP_HYPERV_VAPIC 45 +#define KVM_CAP_HYPERV_SPIN 46 +#define KVM_CAP_PCI_SEGMENT 47 +#define KVM_CAP_X86_ROBUST_SINGLESTEP 51 #ifdef KVM_CAP_IRQ_ROUTING @@ -691,8 +696,9 @@ struct kvm_assigned_pci_dev { __u32 busnr; __u32 devfn; __u32 flags; + __u32 segnr; union { - __u32 reserved[12]; + __u32 reserved[11]; }; }; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bd5a616d937..a3fd0f91d94 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -38,6 +38,7 @@ #define KVM_REQ_MMU_SYNC 7 #define KVM_REQ_KVMCLOCK_UPDATE 8 #define KVM_REQ_KICK 9 +#define KVM_REQ_DEACTIVATE_FPU 10 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 @@ -57,20 +58,20 @@ struct kvm_io_bus { struct kvm_io_device *devs[NR_IOBUS_DEVS]; }; -void kvm_io_bus_init(struct kvm_io_bus *bus); -void kvm_io_bus_destroy(struct kvm_io_bus *bus); -int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, int len, - const void *val); -int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, +enum kvm_bus { + KVM_MMIO_BUS, + KVM_PIO_BUS, + KVM_NR_BUSES +}; + +int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, const void *val); +int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); -int __kvm_io_bus_register_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev); -int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); -void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev); -void kvm_io_bus_unregister_dev(struct kvm *kvm, struct kvm_io_bus *bus, - struct kvm_io_device *dev); +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); struct kvm_vcpu { struct kvm *kvm; @@ -83,6 +84,8 @@ struct kvm_vcpu { struct kvm_run *run; unsigned long requests; unsigned long guest_debug; + int srcu_idx; + int fpu_active; int guest_fpu_loaded; wait_queue_head_t wq; @@ -150,14 +153,19 @@ struct kvm_irq_routing_table {}; #endif -struct kvm { - spinlock_t mmu_lock; - spinlock_t requests_lock; - struct rw_semaphore slots_lock; - struct mm_struct *mm; /* userspace tied to this vm */ +struct kvm_memslots { int nmemslots; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS]; +}; + +struct kvm { + spinlock_t mmu_lock; + raw_spinlock_t requests_lock; + struct mutex slots_lock; + struct mm_struct *mm; /* userspace tied to this vm */ + struct kvm_memslots *memslots; + struct srcu_struct srcu; #ifdef CONFIG_KVM_APIC_ARCHITECTURE u32 bsp_vcpu_id; struct kvm_vcpu *bsp_vcpu; @@ -166,8 +174,7 @@ struct kvm { atomic_t online_vcpus; struct list_head vm_list; struct mutex lock; - struct kvm_io_bus mmio_bus; - struct kvm_io_bus pio_bus; + struct kvm_io_bus *buses[KVM_NR_BUSES]; #ifdef CONFIG_HAVE_KVM_EVENTFD struct { spinlock_t lock; @@ -249,13 +256,20 @@ int kvm_set_memory_region(struct kvm *kvm, int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc); -int kvm_arch_set_memory_region(struct kvm *kvm, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc); +void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, int user_alloc); void kvm_disable_largepages(void); void kvm_arch_flush_shadow(struct kvm *kvm); gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn); + struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); void kvm_release_page_clean(struct page *page); @@ -264,6 +278,9 @@ void kvm_set_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); +pfn_t gfn_to_pfn_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); +int memslot_id(struct kvm *kvm, gfn_t gfn); void kvm_release_pfn_dirty(pfn_t); void kvm_release_pfn_clean(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn); @@ -283,6 +300,7 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); +unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); @@ -383,6 +401,7 @@ struct kvm_assigned_dev_kernel { struct work_struct interrupt_work; struct list_head list; int assigned_dev_id; + int host_segnr; int host_busnr; int host_devfn; unsigned int entries_nr; @@ -429,8 +448,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); #define KVM_IOMMU_CACHE_COHERENCY 0x1 #ifdef CONFIG_IOMMU_API -int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, - unsigned long npages); +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); int kvm_iommu_map_guest(struct kvm *kvm); int kvm_iommu_unmap_guest(struct kvm *kvm); int kvm_assign_device(struct kvm *kvm, @@ -480,11 +498,6 @@ static inline void kvm_guest_exit(void) current->flags &= ~PF_VCPU; } -static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - return slot - kvm->memslots; -} - static inline gpa_t gfn_to_gpa(gfn_t gfn) { return (gpa_t)gfn << PAGE_SHIFT; @@ -532,6 +545,10 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se } #endif +#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION +#define unalias_gfn_instantiation unalias_gfn +#endif + #ifdef CONFIG_HAVE_KVM_IRQCHIP #define KVM_MAX_IRQ_ROUTES 1024 diff --git a/include/linux/quota.h b/include/linux/quota.h index a6861f11748..b462916b2a0 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -279,9 +279,6 @@ struct dquot { struct mem_dqblk dq_dqb; /* Diskquota usage */ }; -#define QUOTA_OK 0 -#define NO_QUOTA 1 - /* Operations which must be implemented by each quota format */ struct quota_format_ops { int (*check_quota_file)(struct super_block *sb, int type); /* Detect whether file is in our format */ @@ -295,13 +292,6 @@ struct quota_format_ops { /* Operations working with dquots */ struct dquot_operations { - int (*initialize) (struct inode *, int); - int (*drop) (struct inode *); - int (*alloc_space) (struct inode *, qsize_t, int); - int (*alloc_inode) (const struct inode *, qsize_t); - int (*free_space) (struct inode *, qsize_t); - int (*free_inode) (const struct inode *, qsize_t); - int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ @@ -309,12 +299,6 @@ struct dquot_operations { int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ - /* reserve quota for delayed block allocation */ - int (*reserve_space) (struct inode *, qsize_t, int); - /* claim reserved quota for delayed alloc */ - int (*claim_space) (struct inode *, qsize_t); - /* release rsved quota for delayed alloc */ - void (*release_rsv) (struct inode *, qsize_t); /* get reserved quota for delayed alloc, value returned is managed by * quota code only */ qsize_t *(*get_reserved_space) (struct inode *); @@ -324,7 +308,7 @@ struct dquot_operations { struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, char *, int); int (*quota_off)(struct super_block *, int, int); - int (*quota_sync)(struct super_block *, int); + int (*quota_sync)(struct super_block *, int, int); int (*get_info)(struct super_block *, int, struct if_dqinfo *); int (*set_info)(struct super_block *, int, struct if_dqinfo *); int (*get_dqblk)(struct super_block *, int, qid_t, struct if_dqblk *); @@ -357,26 +341,25 @@ enum { #define DQUOT_STATE_FLAGS (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED | \ DQUOT_SUSPENDED) /* Other quota flags */ -#define DQUOT_QUOTA_SYS_FILE (1 << 6) /* Quota file is a special +#define DQUOT_STATE_LAST (_DQUOT_STATE_FLAGS * MAXQUOTAS) +#define DQUOT_QUOTA_SYS_FILE (1 << DQUOT_STATE_LAST) + /* Quota file is a special * system file and user cannot * touch it. Filesystem is * responsible for setting * S_NOQUOTA, S_NOATIME flags */ -#define DQUOT_NEGATIVE_USAGE (1 << 7) /* Allow negative quota usage */ +#define DQUOT_NEGATIVE_USAGE (1 << (DQUOT_STATE_LAST + 1)) + /* Allow negative quota usage */ static inline unsigned int dquot_state_flag(unsigned int flags, int type) { - if (type == USRQUOTA) - return flags; - return flags << _DQUOT_STATE_FLAGS; + return flags << _DQUOT_STATE_FLAGS * type; } static inline unsigned int dquot_generic_flag(unsigned int flags, int type) { - if (type == USRQUOTA) - return flags; - return flags >> _DQUOT_STATE_FLAGS; + return (flags >> _DQUOT_STATE_FLAGS * type) & DQUOT_STATE_FLAGS; } #ifdef CONFIG_QUOTA_NETLINK_INTERFACE diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 3ebb2315364..e6fa7acce29 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -19,15 +19,12 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) /* * declaration of quota_function calls in kernel. */ -void sync_quota_sb(struct super_block *sb, int type); -static inline void writeout_quota_sb(struct super_block *sb, int type) -{ - if (sb->s_qcop->quota_sync) - sb->s_qcop->quota_sync(sb, type); -} +void inode_add_rsv_space(struct inode *inode, qsize_t number); +void inode_claim_rsv_space(struct inode *inode, qsize_t number); +void inode_sub_rsv_space(struct inode *inode, qsize_t number); -int dquot_initialize(struct inode *inode, int type); -int dquot_drop(struct inode *inode); +void dquot_initialize(struct inode *inode); +void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, unsigned int id, int type); void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, @@ -36,24 +33,23 @@ int dquot_scan_active(struct super_block *sb, struct dquot *dquot_alloc(struct super_block *sb, int type); void dquot_destroy(struct dquot *dquot); -int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc); -int dquot_alloc_inode(const struct inode *inode, qsize_t number); +int __dquot_alloc_space(struct inode *inode, qsize_t number, + int warn, int reserve); +void __dquot_free_space(struct inode *inode, qsize_t number, int reserve); -int dquot_reserve_space(struct inode *inode, qsize_t number, int prealloc); -int dquot_claim_space(struct inode *inode, qsize_t number); -void dquot_release_reserved_space(struct inode *inode, qsize_t number); -qsize_t dquot_get_reserved_space(struct inode *inode); +int dquot_alloc_inode(const struct inode *inode); -int dquot_free_space(struct inode *inode, qsize_t number); -int dquot_free_inode(const struct inode *inode, qsize_t number); +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); +void dquot_free_inode(const struct inode *inode); -int dquot_transfer(struct inode *inode, struct iattr *iattr); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); int dquot_commit_info(struct super_block *sb, int type); int dquot_mark_dquot_dirty(struct dquot *dquot); +int dquot_file_open(struct inode *inode, struct file *file); + int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path, int remount); int vfs_quota_enable(struct inode *inode, int type, int format_id, @@ -64,14 +60,13 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); int vfs_quota_off(struct super_block *sb, int type, int remount); int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags); -int vfs_quota_sync(struct super_block *sb, int type); +int vfs_quota_sync(struct super_block *sb, int type, int wait); int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); -void vfs_dq_drop(struct inode *inode); -int vfs_dq_transfer(struct inode *inode, struct iattr *iattr); +int dquot_transfer(struct inode *inode, struct iattr *iattr); int vfs_dq_quota_on_remount(struct super_block *sb); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) @@ -83,53 +78,56 @@ static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) * Functions for checking status of quota */ -static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) +static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_USAGE_ENABLED, type); } -static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type) +static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_LIMITS_ENABLED, type); } -static inline int sb_has_quota_suspended(struct super_block *sb, int type) +static inline bool sb_has_quota_suspended(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_SUSPENDED, type); } -static inline int sb_any_quota_suspended(struct super_block *sb) +static inline unsigned sb_any_quota_suspended(struct super_block *sb) { - return sb_has_quota_suspended(sb, USRQUOTA) || - sb_has_quota_suspended(sb, GRPQUOTA); + unsigned type, tmsk = 0; + for (type = 0; type < MAXQUOTAS; type++) + tmsk |= sb_has_quota_suspended(sb, type) << type; + return tmsk; } /* Does kernel know about any quota information for given sb + type? */ -static inline int sb_has_quota_loaded(struct super_block *sb, int type) +static inline bool sb_has_quota_loaded(struct super_block *sb, int type) { /* Currently if anything is on, then quota usage is on as well */ return sb_has_quota_usage_enabled(sb, type); } -static inline int sb_any_quota_loaded(struct super_block *sb) +static inline unsigned sb_any_quota_loaded(struct super_block *sb) { - return sb_has_quota_loaded(sb, USRQUOTA) || - sb_has_quota_loaded(sb, GRPQUOTA); + unsigned type, tmsk = 0; + for (type = 0; type < MAXQUOTAS; type++) + tmsk |= sb_has_quota_loaded(sb, type) << type; + return tmsk; } -static inline int sb_has_quota_active(struct super_block *sb, int type) +static inline bool sb_has_quota_active(struct super_block *sb, int type) { return sb_has_quota_loaded(sb, type) && !sb_has_quota_suspended(sb, type); } -static inline int sb_any_quota_active(struct super_block *sb) +static inline unsigned sb_any_quota_active(struct super_block *sb) { - return sb_has_quota_active(sb, USRQUOTA) || - sb_has_quota_active(sb, GRPQUOTA); + return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); } /* @@ -141,122 +139,6 @@ extern const struct quotactl_ops vfs_quotactl_ops; #define sb_dquot_ops (&dquot_operations) #define sb_quotactl_ops (&vfs_quotactl_ops) -/* It is better to call this function outside of any transaction as it might - * need a lot of space in journal for dquot structure allocation. */ -static inline void vfs_dq_init(struct inode *inode) -{ - BUG_ON(!inode->i_sb); - if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) - inode->i_sb->dq_op->initialize(inode, -1); -} - -/* The following allocation/freeing/transfer functions *must* be called inside - * a transaction (deadlocks possible otherwise) */ -static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA) - return 1; - } - else - inode_add_bytes(inode, nr); - return 0; -} - -static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr) -{ - int ret; - if (!(ret = vfs_dq_prealloc_space_nodirty(inode, nr))) - mark_inode_dirty(inode); - return ret; -} - -static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA) - return 1; - } - else - inode_add_bytes(inode, nr); - return 0; -} - -static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) -{ - int ret; - if (!(ret = vfs_dq_alloc_space_nodirty(inode, nr))) - mark_inode_dirty(inode); - return ret; -} - -static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->reserve_space(inode, nr, 0) == NO_QUOTA) - return 1; - } - return 0; -} - -static inline int vfs_dq_alloc_inode(struct inode *inode) -{ - if (sb_any_quota_active(inode->i_sb)) { - vfs_dq_init(inode); - if (inode->i_sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) - return 1; - } - return 0; -} - -/* - * Convert in-memory reserved quotas to real consumed quotas - */ -static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - if (inode->i_sb->dq_op->claim_space(inode, nr) == NO_QUOTA) - return 1; - } else - inode_add_bytes(inode, nr); - - mark_inode_dirty(inode); - return 0; -} - -/* - * Release reserved (in-memory) quotas - */ -static inline -void vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->release_rsv(inode, nr); -} - -static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->free_space(inode, nr); - else - inode_sub_bytes(inode, nr); -} - -static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr) -{ - vfs_dq_free_space_nodirty(inode, nr); - mark_inode_dirty(inode); -} - -static inline void vfs_dq_free_inode(struct inode *inode) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->free_inode(inode, 1); -} - /* Cannot be called inside a transaction */ static inline int vfs_dq_off(struct super_block *sb, int remount) { @@ -316,28 +198,20 @@ static inline int sb_any_quota_active(struct super_block *sb) #define sb_dquot_ops (NULL) #define sb_quotactl_ops (NULL) -static inline void vfs_dq_init(struct inode *inode) +static inline void dquot_initialize(struct inode *inode) { } -static inline void vfs_dq_drop(struct inode *inode) +static inline void dquot_drop(struct inode *inode) { } -static inline int vfs_dq_alloc_inode(struct inode *inode) +static inline int dquot_alloc_inode(const struct inode *inode) { return 0; } -static inline void vfs_dq_free_inode(struct inode *inode) -{ -} - -static inline void sync_quota_sb(struct super_block *sb, int type) -{ -} - -static inline void writeout_quota_sb(struct super_block *sb, int type) +static inline void dquot_free_inode(const struct inode *inode) { } @@ -351,110 +225,116 @@ static inline int vfs_dq_quota_on_remount(struct super_block *sb) return 0; } -static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +static inline int dquot_transfer(struct inode *inode, struct iattr *iattr) { return 0; } -static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) +static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, + int warn, int reserve) { - inode_add_bytes(inode, nr); + if (!reserve) + inode_add_bytes(inode, number); return 0; } -static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr) +static inline void __dquot_free_space(struct inode *inode, qsize_t number, + int reserve) { - vfs_dq_prealloc_space_nodirty(inode, nr); - mark_inode_dirty(inode); - return 0; + if (!reserve) + inode_sub_bytes(inode, number); } -static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { - inode_add_bytes(inode, nr); + inode_add_bytes(inode, number); return 0; } -static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) +#define dquot_file_open generic_file_open + +#endif /* CONFIG_QUOTA */ + +static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) { - vfs_dq_alloc_space_nodirty(inode, nr); - mark_inode_dirty(inode); - return 0; + return __dquot_alloc_space(inode, nr, 1, 0); } -static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) { - return 0; + int ret; + + ret = dquot_alloc_space_nodirty(inode, nr); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr) { - return vfs_dq_alloc_space(inode, nr); + return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits); } -static inline -int vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_block(struct inode *inode, qsize_t nr) { - return 0; + return dquot_alloc_space(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { - inode_sub_bytes(inode, nr); + return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0, 0); } -static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr) +static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) { - vfs_dq_free_space_nodirty(inode, nr); - mark_inode_dirty(inode); -} - -#endif /* CONFIG_QUOTA */ + int ret; -static inline int vfs_dq_prealloc_block_nodirty(struct inode *inode, qsize_t nr) -{ - return vfs_dq_prealloc_space_nodirty(inode, nr << inode->i_blkbits); + ret = dquot_prealloc_block_nodirty(inode, nr); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_prealloc_block(struct inode *inode, qsize_t nr) +static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) { - return vfs_dq_prealloc_space(inode, nr << inode->i_blkbits); + return __dquot_alloc_space(inode, nr << inode->i_blkbits, 1, 1); } -static inline int vfs_dq_alloc_block_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_claim_block(struct inode *inode, qsize_t nr) { - return vfs_dq_alloc_space_nodirty(inode, nr << inode->i_blkbits); -} + int ret; -static inline int vfs_dq_alloc_block(struct inode *inode, qsize_t nr) -{ - return vfs_dq_alloc_space(inode, nr << inode->i_blkbits); + ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_reserve_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) { - return vfs_dq_reserve_space(inode, nr << inode->i_blkbits); + __dquot_free_space(inode, nr, 0); } -static inline int vfs_dq_claim_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_space(struct inode *inode, qsize_t nr) { - return vfs_dq_claim_space(inode, nr << inode->i_blkbits); + dquot_free_space_nodirty(inode, nr); + mark_inode_dirty(inode); } -static inline -void vfs_dq_release_reservation_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) { - vfs_dq_release_reservation_space(inode, nr << inode->i_blkbits); + dquot_free_space_nodirty(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_block_nodirty(struct inode *inode, qsize_t nr) +static inline void dquot_free_block(struct inode *inode, qsize_t nr) { - vfs_dq_free_space_nodirty(inode, nr << inode->i_blkbits); + dquot_free_space(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_block(struct inode *inode, qsize_t nr) +static inline void dquot_release_reservation_block(struct inode *inode, + qsize_t nr) { - vfs_dq_free_space(inode, nr << inode->i_blkbits); + __dquot_free_space(inode, nr << inode->i_blkbits, 1); } #endif /* _LINUX_QUOTAOPS_ */ diff --git a/include/linux/virtio_9p.h b/include/linux/virtio_9p.h index 095e10d148b..33227508008 100644 --- a/include/linux/virtio_9p.h +++ b/include/linux/virtio_9p.h @@ -5,7 +5,4 @@ #include <linux/virtio_ids.h> #include <linux/virtio_config.h> -/* Maximum number of virtio channels per partition (1 for now) */ -#define MAX_9P_CHAN 1 - #endif /* _LINUX_VIRTIO_9P_H */ diff --git a/include/net/9p/client.h b/include/net/9p/client.h index fb00b329f0d..52e1fff709e 100644 --- a/include/net/9p/client.h +++ b/include/net/9p/client.h @@ -29,6 +29,19 @@ /* Number of requests per row */ #define P9_ROW_MAXTAG 255 +/** enum p9_proto_versions - 9P protocol versions + * @p9_proto_legacy: 9P Legacy mode, pre-9P2000.u + * @p9_proto_2000u: 9P2000.u extension + * @p9_proto_2010L: 9P2010.L extension + */ + +enum p9_proto_versions{ + p9_proto_legacy = 0, + p9_proto_2000u = 1, + p9_proto_2010L = 2, +}; + + /** * enum p9_trans_status - different states of underlying transports * @Connected: transport is connected and healthy @@ -111,6 +124,7 @@ struct p9_req_t { * @lock: protect @fidlist * @msize: maximum data size negotiated by protocol * @dotu: extension flags negotiated by protocol + * @proto_version: 9P protocol version to use * @trans_mod: module API instantiated with this client * @trans: tranport instance state and API * @conn: connection state information used by trans_fd @@ -137,7 +151,7 @@ struct p9_req_t { struct p9_client { spinlock_t lock; /* protect client structure */ int msize; - unsigned char dotu; + unsigned char proto_version; struct p9_trans_module *trans_mod; enum p9_trans_status status; void *trans; @@ -209,5 +223,7 @@ int p9_parse_header(struct p9_fcall *, int32_t *, int8_t *, int16_t *, int); int p9stat_read(char *, int, struct p9_wstat *, int); void p9stat_free(struct p9_wstat *); +int p9_is_proto_dotu(struct p9_client *clnt); +int p9_is_proto_dotl(struct p9_client *clnt); #endif /* NET_9P_CLIENT_H */ diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index dbe10845527..b17d49dfc3e 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -145,6 +145,47 @@ TRACE_EVENT(kvm_mmio, __entry->len, __entry->gpa, __entry->val) ); +#define kvm_fpu_load_symbol \ + {0, "unload"}, \ + {1, "load"} + +TRACE_EVENT(kvm_fpu, + TP_PROTO(int load), + TP_ARGS(load), + + TP_STRUCT__entry( + __field( u32, load ) + ), + + TP_fast_assign( + __entry->load = load; + ), + + TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol)) +); + +TRACE_EVENT(kvm_age_page, + TP_PROTO(ulong hva, struct kvm_memory_slot *slot, int ref), + TP_ARGS(hva, slot, ref), + + TP_STRUCT__entry( + __field( u64, hva ) + __field( u64, gfn ) + __field( u8, referenced ) + ), + + TP_fast_assign( + __entry->hva = hva; + __entry->gfn = + slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT); + __entry->referenced = ref; + ), + + TP_printk("hva %llx gfn %llx %s", + __entry->hva, __entry->gfn, + __entry->referenced ? "YOUNG" : "OLD") +); + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ diff --git a/net/9p/client.c b/net/9p/client.c index 09d4f1e2e4a..bde9f3d38c5 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -46,6 +46,7 @@ enum { Opt_msize, Opt_trans, Opt_legacy, + Opt_version, Opt_err, }; @@ -53,9 +54,42 @@ static const match_table_t tokens = { {Opt_msize, "msize=%u"}, {Opt_legacy, "noextend"}, {Opt_trans, "trans=%s"}, + {Opt_version, "version=%s"}, {Opt_err, NULL}, }; +inline int p9_is_proto_dotl(struct p9_client *clnt) +{ + return (clnt->proto_version == p9_proto_2010L); +} +EXPORT_SYMBOL(p9_is_proto_dotl); + +inline int p9_is_proto_dotu(struct p9_client *clnt) +{ + return (clnt->proto_version == p9_proto_2000u); +} +EXPORT_SYMBOL(p9_is_proto_dotu); + +/* Interpret mount option for protocol version */ +static unsigned char get_protocol_version(const substring_t *name) +{ + unsigned char version = -EINVAL; + if (!strncmp("9p2000", name->from, name->to-name->from)) { + version = p9_proto_legacy; + P9_DPRINTK(P9_DEBUG_9P, "Protocol version: Legacy\n"); + } else if (!strncmp("9p2000.u", name->from, name->to-name->from)) { + version = p9_proto_2000u; + P9_DPRINTK(P9_DEBUG_9P, "Protocol version: 9P2000.u\n"); + } else if (!strncmp("9p2010.L", name->from, name->to-name->from)) { + version = p9_proto_2010L; + P9_DPRINTK(P9_DEBUG_9P, "Protocol version: 9P2010.L\n"); + } else { + P9_DPRINTK(P9_DEBUG_ERROR, "Unknown protocol version %s. ", + name->from); + } + return version; +} + static struct p9_req_t * p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...); @@ -75,7 +109,7 @@ static int parse_opts(char *opts, struct p9_client *clnt) int option; int ret = 0; - clnt->dotu = 1; + clnt->proto_version = p9_proto_2000u; clnt->msize = 8192; if (!opts) @@ -118,7 +152,13 @@ static int parse_opts(char *opts, struct p9_client *clnt) } break; case Opt_legacy: - clnt->dotu = 0; + clnt->proto_version = p9_proto_legacy; + break; + case Opt_version: + ret = get_protocol_version(&args[0]); + if (ret == -EINVAL) + goto free_and_return; + clnt->proto_version = ret; break; default: continue; @@ -410,14 +450,15 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) int ecode; char *ename; - err = p9pdu_readf(req->rc, c->dotu, "s?d", &ename, &ecode); + err = p9pdu_readf(req->rc, c->proto_version, "s?d", + &ename, &ecode); if (err) { P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); return err; } - if (c->dotu) + if (p9_is_proto_dotu(c)) err = -ecode; if (!err || !IS_ERR_VALUE(err)) @@ -515,7 +556,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) /* marshall the data */ p9pdu_prepare(req->tc, tag, type); va_start(ap, fmt); - err = p9pdu_vwritef(req->tc, c->dotu, fmt, ap); + err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap); va_end(ap); p9pdu_finalize(req->tc); @@ -627,14 +668,31 @@ int p9_client_version(struct p9_client *c) char *version; int msize; - P9_DPRINTK(P9_DEBUG_9P, ">>> TVERSION msize %d extended %d\n", - c->msize, c->dotu); - req = p9_client_rpc(c, P9_TVERSION, "ds", c->msize, - c->dotu ? "9P2000.u" : "9P2000"); + P9_DPRINTK(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n", + c->msize, c->proto_version); + + switch (c->proto_version) { + case p9_proto_2010L: + req = p9_client_rpc(c, P9_TVERSION, "ds", + c->msize, "9P2010.L"); + break; + case p9_proto_2000u: + req = p9_client_rpc(c, P9_TVERSION, "ds", + c->msize, "9P2000.u"); + break; + case p9_proto_legacy: + req = p9_client_rpc(c, P9_TVERSION, "ds", + c->msize, "9P2000"); + break; + default: + return -EINVAL; + break; + } + if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, c->dotu, "ds", &msize, &version); + err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version); if (err) { P9_DPRINTK(P9_DEBUG_9P, "version error %d\n", err); p9pdu_dump(1, req->rc); @@ -642,10 +700,12 @@ int p9_client_version(struct p9_client *c) } P9_DPRINTK(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version); - if (!memcmp(version, "9P2000.u", 8)) - c->dotu = 1; - else if (!memcmp(version, "9P2000", 6)) - c->dotu = 0; + if (!strncmp(version, "9P2010.L", 8)) + c->proto_version = p9_proto_2010L; + else if (!strncmp(version, "9P2000.u", 8)) + c->proto_version = p9_proto_2000u; + else if (!strncmp(version, "9P2000", 6)) + c->proto_version = p9_proto_legacy; else { err = -EREMOTEIO; goto error; @@ -700,8 +760,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) goto put_trans; } - P9_DPRINTK(P9_DEBUG_MUX, "clnt %p trans %p msize %d dotu %d\n", - clnt, clnt->trans_mod, clnt->msize, clnt->dotu); + P9_DPRINTK(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", + clnt, clnt->trans_mod, clnt->msize, clnt->proto_version); err = clnt->trans_mod->create(clnt, dev_name, options); if (err) @@ -784,7 +844,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "Q", &qid); + err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid); if (err) { p9pdu_dump(1, req->rc); p9_free_req(clnt, req); @@ -833,7 +893,7 @@ p9_client_auth(struct p9_client *clnt, char *uname, u32 n_uname, char *aname) goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "Q", &qid); + err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid); if (err) { p9pdu_dump(1, req->rc); p9_free_req(clnt, req); @@ -891,7 +951,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames, goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "R", &nwqids, &wqids); + err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids); if (err) { p9pdu_dump(1, req->rc); p9_free_req(clnt, req); @@ -952,7 +1012,7 @@ int p9_client_open(struct p9_fid *fid, int mode) goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "Qd", &qid, &iounit); + err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { p9pdu_dump(1, req->rc); goto free_and_error; @@ -997,7 +1057,7 @@ int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode, goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "Qd", &qid, &iounit); + err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { p9pdu_dump(1, req->rc); goto free_and_error; @@ -1098,7 +1158,7 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "D", &count, &dataptr); + err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); if (err) { p9pdu_dump(1, req->rc); goto free_and_error; @@ -1159,7 +1219,7 @@ p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "d", &count); + err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count); if (err) { p9pdu_dump(1, req->rc); goto free_and_error; @@ -1199,7 +1259,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "wS", &ignored, ret); + err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret); if (err) { p9pdu_dump(1, req->rc); p9_free_req(clnt, req); @@ -1226,7 +1286,7 @@ error: } EXPORT_SYMBOL(p9_client_stat); -static int p9_client_statsize(struct p9_wstat *wst, int optional) +static int p9_client_statsize(struct p9_wstat *wst, int proto_version) { int ret; @@ -1245,7 +1305,7 @@ static int p9_client_statsize(struct p9_wstat *wst, int optional) if (wst->muid) ret += strlen(wst->muid); - if (optional) { + if (proto_version == p9_proto_2000u) { ret += 2+4+4+4; /* extension[s] n_uid[4] n_gid[4] n_muid[4] */ if (wst->extension) ret += strlen(wst->extension); @@ -1262,7 +1322,7 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) err = 0; clnt = fid->clnt; - wst->size = p9_client_statsize(wst, clnt->dotu); + wst->size = p9_client_statsize(wst, clnt->proto_version); P9_DPRINTK(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid); P9_DPRINTK(P9_DEBUG_9P, " sz=%x type=%x dev=%x qid=%x.%llx.%x\n" diff --git a/net/9p/protocol.c b/net/9p/protocol.c index fc70147c771..94f5a8f65e9 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -52,7 +52,7 @@ #endif static int -p9pdu_writef(struct p9_fcall *pdu, int optional, const char *fmt, ...); +p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); #ifdef CONFIG_NET_9P_DEBUG void @@ -144,7 +144,8 @@ pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size) */ static int -p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) +p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap) { const char *ptr; int errcode = 0; @@ -194,7 +195,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) int16_t len; int size; - errcode = p9pdu_readf(pdu, optional, "w", &len); + errcode = p9pdu_readf(pdu, proto_version, + "w", &len); if (errcode) break; @@ -217,7 +219,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) struct p9_qid *qid = va_arg(ap, struct p9_qid *); - errcode = p9pdu_readf(pdu, optional, "bdq", + errcode = p9pdu_readf(pdu, proto_version, "bdq", &qid->type, &qid->version, &qid->path); } @@ -230,7 +232,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) stbuf->n_uid = stbuf->n_gid = stbuf->n_muid = -1; errcode = - p9pdu_readf(pdu, optional, + p9pdu_readf(pdu, proto_version, "wwdQdddqssss?sddd", &stbuf->size, &stbuf->type, &stbuf->dev, &stbuf->qid, @@ -250,7 +252,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) void **data = va_arg(ap, void **); errcode = - p9pdu_readf(pdu, optional, "d", count); + p9pdu_readf(pdu, proto_version, "d", count); if (!errcode) { *count = MIN(*count, @@ -263,8 +265,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) int16_t *nwname = va_arg(ap, int16_t *); char ***wnames = va_arg(ap, char ***); - errcode = - p9pdu_readf(pdu, optional, "w", nwname); + errcode = p9pdu_readf(pdu, proto_version, + "w", nwname); if (!errcode) { *wnames = kmalloc(sizeof(char *) * *nwname, @@ -278,7 +280,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) for (i = 0; i < *nwname; i++) { errcode = - p9pdu_readf(pdu, optional, + p9pdu_readf(pdu, + proto_version, "s", &(*wnames)[i]); if (errcode) @@ -306,7 +309,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) *wqids = NULL; errcode = - p9pdu_readf(pdu, optional, "w", nwqid); + p9pdu_readf(pdu, proto_version, "w", nwqid); if (!errcode) { *wqids = kmalloc(*nwqid * @@ -321,7 +324,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) for (i = 0; i < *nwqid; i++) { errcode = - p9pdu_readf(pdu, optional, + p9pdu_readf(pdu, + proto_version, "Q", &(*wqids)[i]); if (errcode) @@ -336,7 +340,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) } break; case '?': - if (!optional) + if (proto_version != p9_proto_2000u) return 0; break; default: @@ -352,7 +356,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) } int -p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) +p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap) { const char *ptr; int errcode = 0; @@ -389,7 +394,8 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) if (sptr) len = MIN(strlen(sptr), USHORT_MAX); - errcode = p9pdu_writef(pdu, optional, "w", len); + errcode = p9pdu_writef(pdu, proto_version, + "w", len); if (!errcode && pdu_write(pdu, sptr, len)) errcode = -EFAULT; } @@ -398,7 +404,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) const struct p9_qid *qid = va_arg(ap, const struct p9_qid *); errcode = - p9pdu_writef(pdu, optional, "bdq", + p9pdu_writef(pdu, proto_version, "bdq", qid->type, qid->version, qid->path); } break; @@ -406,7 +412,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) const struct p9_wstat *stbuf = va_arg(ap, const struct p9_wstat *); errcode = - p9pdu_writef(pdu, optional, + p9pdu_writef(pdu, proto_version, "wwdQdddqssss?sddd", stbuf->size, stbuf->type, stbuf->dev, &stbuf->qid, @@ -421,8 +427,8 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) int32_t count = va_arg(ap, int32_t); const void *data = va_arg(ap, const void *); - errcode = - p9pdu_writef(pdu, optional, "d", count); + errcode = p9pdu_writef(pdu, proto_version, "d", + count); if (!errcode && pdu_write(pdu, data, count)) errcode = -EFAULT; } @@ -431,8 +437,8 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) int32_t count = va_arg(ap, int32_t); const char __user *udata = va_arg(ap, const void __user *); - errcode = - p9pdu_writef(pdu, optional, "d", count); + errcode = p9pdu_writef(pdu, proto_version, "d", + count); if (!errcode && pdu_write_u(pdu, udata, count)) errcode = -EFAULT; } @@ -441,14 +447,15 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) int16_t nwname = va_arg(ap, int); const char **wnames = va_arg(ap, const char **); - errcode = - p9pdu_writef(pdu, optional, "w", nwname); + errcode = p9pdu_writef(pdu, proto_version, "w", + nwname); if (!errcode) { int i; for (i = 0; i < nwname; i++) { errcode = - p9pdu_writef(pdu, optional, + p9pdu_writef(pdu, + proto_version, "s", wnames[i]); if (errcode) @@ -462,14 +469,15 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) struct p9_qid *wqids = va_arg(ap, struct p9_qid *); - errcode = - p9pdu_writef(pdu, optional, "w", nwqid); + errcode = p9pdu_writef(pdu, proto_version, "w", + nwqid); if (!errcode) { int i; for (i = 0; i < nwqid; i++) { errcode = - p9pdu_writef(pdu, optional, + p9pdu_writef(pdu, + proto_version, "Q", &wqids[i]); if (errcode) @@ -479,7 +487,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) } break; case '?': - if (!optional) + if (proto_version != p9_proto_2000u) return 0; break; default: @@ -494,32 +502,32 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) return errcode; } -int p9pdu_readf(struct p9_fcall *pdu, int optional, const char *fmt, ...) +int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); - ret = p9pdu_vreadf(pdu, optional, fmt, ap); + ret = p9pdu_vreadf(pdu, proto_version, fmt, ap); va_end(ap); return ret; } static int -p9pdu_writef(struct p9_fcall *pdu, int optional, const char *fmt, ...) +p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); - ret = p9pdu_vwritef(pdu, optional, fmt, ap); + ret = p9pdu_vwritef(pdu, proto_version, fmt, ap); va_end(ap); return ret; } -int p9stat_read(char *buf, int len, struct p9_wstat *st, int dotu) +int p9stat_read(char *buf, int len, struct p9_wstat *st, int proto_version) { struct p9_fcall fake_pdu; int ret; @@ -529,7 +537,7 @@ int p9stat_read(char *buf, int len, struct p9_wstat *st, int dotu) fake_pdu.sdata = buf; fake_pdu.offset = 0; - ret = p9pdu_readf(&fake_pdu, dotu, "S", st); + ret = p9pdu_readf(&fake_pdu, proto_version, "S", st); if (ret) { P9_DPRINTK(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret); p9pdu_dump(1, &fake_pdu); diff --git a/net/9p/protocol.h b/net/9p/protocol.h index ccde462e7ac..2431c0f38d5 100644 --- a/net/9p/protocol.h +++ b/net/9p/protocol.h @@ -25,9 +25,9 @@ * */ -int -p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap); -int p9pdu_readf(struct p9_fcall *pdu, int optional, const char *fmt, ...); +int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap); +int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type); int p9pdu_finalize(struct p9_fcall *pdu); void p9pdu_dump(int, struct p9_fcall *); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index cb50f4ae5ee..0aaed481937 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -49,8 +49,6 @@ /* a single mutex to manage channel initialization and attachment */ static DEFINE_MUTEX(virtio_9p_lock); -/* global which tracks highest initialized channel */ -static int chan_index; /** * struct virtio_chan - per-instance transport information @@ -68,8 +66,7 @@ static int chan_index; * */ -static struct virtio_chan { - bool initialized; +struct virtio_chan { bool inuse; spinlock_t lock; @@ -80,7 +77,11 @@ static struct virtio_chan { /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; -} channels[MAX_9P_CHAN]; + + struct list_head chan_list; +}; + +static struct list_head virtio_chan_list; /* How many bytes left in this page. */ static unsigned int rest_of_page(void *data) @@ -217,9 +218,7 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) * p9_virtio_probe - probe for existence of 9P virtio channels * @vdev: virtio device to probe * - * This probes for existing virtio channels. At present only - * a single channel is in use, so in the future more work may need - * to be done here. + * This probes for existing virtio channels. * */ @@ -227,16 +226,10 @@ static int p9_virtio_probe(struct virtio_device *vdev) { int err; struct virtio_chan *chan; - int index; - mutex_lock(&virtio_9p_lock); - index = chan_index++; - chan = &channels[index]; - mutex_unlock(&virtio_9p_lock); - - if (chan_index > MAX_9P_CHAN) { - printk(KERN_ERR "9p: virtio: Maximum channels exceeded\n"); - BUG(); + chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL); + if (!chan) { + printk(KERN_ERR "9p: Failed to allocate virtio 9P channel\n"); err = -ENOMEM; goto fail; } @@ -255,15 +248,15 @@ static int p9_virtio_probe(struct virtio_device *vdev) sg_init_table(chan->sg, VIRTQUEUE_NUM); chan->inuse = false; - chan->initialized = true; + mutex_lock(&virtio_9p_lock); + list_add_tail(&chan->chan_list, &virtio_chan_list); + mutex_unlock(&virtio_9p_lock); return 0; out_free_vq: vdev->config->del_vqs(vdev); + kfree(chan); fail: - mutex_lock(&virtio_9p_lock); - chan_index--; - mutex_unlock(&virtio_9p_lock); return err; } @@ -280,35 +273,31 @@ fail: * We use a simple reference count mechanism to ensure that only a single * mount has a channel open at a time. * - * Bugs: doesn't allow identification of a specific channel - * to allocate, channels are allocated sequentially. This was - * a pragmatic decision to get things rolling, but ideally some - * way of identifying the channel to attach to would be nice - * if we are going to support multiple channels. - * */ static int p9_virtio_create(struct p9_client *client, const char *devname, char *args) { - struct virtio_chan *chan = channels; - int index = 0; + struct virtio_chan *chan; + int ret = -ENOENT; + int found = 0; mutex_lock(&virtio_9p_lock); - while (index < MAX_9P_CHAN) { - if (chan->initialized && !chan->inuse) { - chan->inuse = true; - break; - } else { - index++; - chan = &channels[index]; + list_for_each_entry(chan, &virtio_chan_list, chan_list) { + if (!strcmp(devname, dev_name(&chan->vdev->dev))) { + if (!chan->inuse) { + chan->inuse = true; + found = 1; + break; + } + ret = -EBUSY; } } mutex_unlock(&virtio_9p_lock); - if (index >= MAX_9P_CHAN) { + if (!found) { printk(KERN_ERR "9p: no channels available\n"); - return -ENODEV; + return ret; } client->trans = (void *)chan; @@ -329,11 +318,13 @@ static void p9_virtio_remove(struct virtio_device *vdev) struct virtio_chan *chan = vdev->priv; BUG_ON(chan->inuse); + vdev->config->del_vqs(vdev); + + mutex_lock(&virtio_9p_lock); + list_del(&chan->chan_list); + mutex_unlock(&virtio_9p_lock); + kfree(chan); - if (chan->initialized) { - vdev->config->del_vqs(vdev); - chan->initialized = false; - } } static struct virtio_device_id id_table[] = { @@ -364,10 +355,7 @@ static struct p9_trans_module p9_virtio_trans = { /* The standard init function */ static int __init p9_virtio_init(void) { - int count; - - for (count = 0; count < MAX_9P_CHAN; count++) - channels[count].initialized = false; + INIT_LIST_HEAD(&virtio_chan_list); v9fs_register_trans(&p9_virtio_trans); return register_virtio_driver(&p9_virtio_drv); diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index daece36c0a5..7f1178f6b83 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -12,3 +12,6 @@ config HAVE_KVM_EVENTFD config KVM_APIC_ARCHITECTURE bool + +config KVM_MMIO + bool diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index f73de631e3e..057e2cca6af 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -504,12 +504,12 @@ out: static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) { - int r = 0; + int r = 0, idx; struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; mutex_lock(&kvm->lock); - down_read(&kvm->slots_lock); + idx = srcu_read_lock(&kvm->srcu); match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, assigned_dev->assigned_dev_id); @@ -526,7 +526,8 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, r = -ENOMEM; goto out; } - dev = pci_get_bus_and_slot(assigned_dev->busnr, + dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, + assigned_dev->busnr, assigned_dev->devfn); if (!dev) { printk(KERN_INFO "%s: host device not found\n", __func__); @@ -548,6 +549,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, pci_reset_function(dev); match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_segnr = assigned_dev->segnr; match->host_busnr = assigned_dev->busnr; match->host_devfn = assigned_dev->devfn; match->flags = assigned_dev->flags; @@ -573,7 +575,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, } out: - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); mutex_unlock(&kvm->lock); return r; out_list_del: @@ -585,7 +587,7 @@ out_put: pci_dev_put(dev); out_free: kfree(match); - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); mutex_unlock(&kvm->lock); return r; } diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 04d69cd7049..5169736377a 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -92,41 +92,64 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = { int kvm_coalesced_mmio_init(struct kvm *kvm) { struct kvm_coalesced_mmio_dev *dev; + struct page *page; int ret; + ret = -ENOMEM; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto out_err; + kvm->coalesced_mmio_ring = page_address(page); + + ret = -ENOMEM; dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); if (!dev) - return -ENOMEM; + goto out_free_page; spin_lock_init(&dev->lock); kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); dev->kvm = kvm; kvm->coalesced_mmio_dev = dev; - ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev); + mutex_unlock(&kvm->slots_lock); if (ret < 0) - kfree(dev); + goto out_free_dev; + + return ret; +out_free_dev: + kfree(dev); +out_free_page: + __free_page(page); +out_err: return ret; } +void kvm_coalesced_mmio_free(struct kvm *kvm) +{ + if (kvm->coalesced_mmio_ring) + free_page((unsigned long)kvm->coalesced_mmio_ring); +} + int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, - struct kvm_coalesced_mmio_zone *zone) + struct kvm_coalesced_mmio_zone *zone) { struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; if (dev == NULL) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return -ENOBUFS; } dev->zone[dev->nb_zones] = *zone; dev->nb_zones++; - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } @@ -140,10 +163,10 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, if (dev == NULL) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); i = dev->nb_zones; - while(i) { + while (i) { z = &dev->zone[i - 1]; /* unregister all zones @@ -158,7 +181,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, i--; } - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h index 4b49f27fa31..8a5959e3535 100644 --- a/virt/kvm/coalesced_mmio.h +++ b/virt/kvm/coalesced_mmio.h @@ -1,3 +1,6 @@ +#ifndef __KVM_COALESCED_MMIO_H__ +#define __KVM_COALESCED_MMIO_H__ + /* * KVM coalesced MMIO * @@ -7,6 +10,8 @@ * */ +#ifdef CONFIG_KVM_MMIO + #define KVM_COALESCED_MMIO_ZONE_MAX 100 struct kvm_coalesced_mmio_dev { @@ -18,7 +23,17 @@ struct kvm_coalesced_mmio_dev { }; int kvm_coalesced_mmio_init(struct kvm *kvm); +void kvm_coalesced_mmio_free(struct kvm *kvm); int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *zone); int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *zone); + +#else + +static inline int kvm_coalesced_mmio_init(struct kvm *kvm) { return 0; } +static inline void kvm_coalesced_mmio_free(struct kvm *kvm) { } + +#endif + +#endif diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a9d3fc6c681..7016319b1ec 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -47,7 +47,6 @@ struct _irqfd { int gsi; struct list_head list; poll_table pt; - wait_queue_head_t *wqh; wait_queue_t wait; struct work_struct inject; struct work_struct shutdown; @@ -159,8 +158,6 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); - - irqfd->wqh = wqh; add_wait_queue(wqh, &irqfd->wait); } @@ -463,7 +460,7 @@ static int kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; - struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus; + enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; struct _ioeventfd *p; struct eventfd_ctx *eventfd; int ret; @@ -508,7 +505,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) else p->wildcard = true; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); /* Verify that there isnt a match already */ if (ioeventfd_check_collision(kvm, p)) { @@ -518,18 +515,18 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) kvm_iodevice_init(&p->dev, &ioeventfd_ops); - ret = __kvm_io_bus_register_dev(bus, &p->dev); + ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); if (ret < 0) goto unlock_fail; list_add_tail(&p->list, &kvm->ioeventfds); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; unlock_fail: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); fail: kfree(p); @@ -542,7 +539,7 @@ static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; - struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus; + enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; struct _ioeventfd *p, *tmp; struct eventfd_ctx *eventfd; int ret = -ENOENT; @@ -551,7 +548,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) if (IS_ERR(eventfd)) return PTR_ERR(eventfd); - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); @@ -565,13 +562,13 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) if (!p->wildcard && p->datamatch != args->datamatch) continue; - __kvm_io_bus_unregister_dev(bus, &p->dev); + kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); ioeventfd_release(p); ret = 0; break; } - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); eventfd_ctx_put(eventfd); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 38a2d20b89d..3db15a807f8 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -100,6 +100,19 @@ static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) return injected; } +static void update_handled_vectors(struct kvm_ioapic *ioapic) +{ + DECLARE_BITMAP(handled_vectors, 256); + int i; + + memset(handled_vectors, 0, sizeof(handled_vectors)); + for (i = 0; i < IOAPIC_NUM_PINS; ++i) + __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); + memcpy(ioapic->handled_vectors, handled_vectors, + sizeof(handled_vectors)); + smp_wmb(); +} + static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; @@ -134,6 +147,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits |= (u32) val; e->fields.remote_irr = 0; } + update_handled_vectors(ioapic); mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); @@ -241,6 +255,9 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; + smp_rmb(); + if (!test_bit(vector, ioapic->handled_vectors)) + return; mutex_lock(&ioapic->lock); __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); mutex_unlock(&ioapic->lock); @@ -352,6 +369,7 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->ioregsel = 0; ioapic->irr = 0; ioapic->id = 0; + update_handled_vectors(ioapic); } static const struct kvm_io_device_ops ioapic_mmio_ops = { @@ -372,13 +390,28 @@ int kvm_ioapic_init(struct kvm *kvm) kvm_ioapic_reset(ioapic); kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); ioapic->kvm = kvm; - ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev); - if (ret < 0) + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + mutex_unlock(&kvm->slots_lock); + if (ret < 0) { + kvm->arch.vioapic = NULL; kfree(ioapic); + } return ret; } +void kvm_ioapic_destroy(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + if (ioapic) { + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm->arch.vioapic = NULL; + kfree(ioapic); + } +} + int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); @@ -399,6 +432,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) mutex_lock(&ioapic->lock); memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); + update_handled_vectors(ioapic); mutex_unlock(&ioapic->lock); return 0; } diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 419c43b667a..8a751b78a43 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -46,6 +46,7 @@ struct kvm_ioapic { struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); struct mutex lock; + DECLARE_BITMAP(handled_vectors, 256); }; #ifdef DEBUG @@ -71,6 +72,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); +void kvm_ioapic_destroy(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 15147583abd..80fd3ad3b2d 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -32,10 +32,10 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm); static void kvm_iommu_put_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); -int kvm_iommu_map_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) { - gfn_t gfn = base_gfn; + gfn_t gfn = slot->base_gfn; + unsigned long npages = slot->npages; pfn_t pfn; int i, r = 0; struct iommu_domain *domain = kvm->arch.iommu_domain; @@ -54,7 +54,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) continue; - pfn = gfn_to_pfn(kvm, gfn); + pfn = gfn_to_pfn_memslot(kvm, slot, gfn); r = iommu_map_range(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), @@ -69,17 +69,19 @@ int kvm_iommu_map_pages(struct kvm *kvm, return 0; unmap_pages: - kvm_iommu_put_pages(kvm, base_gfn, i); + kvm_iommu_put_pages(kvm, slot->base_gfn, i); return r; } static int kvm_iommu_map_memslots(struct kvm *kvm) { int i, r = 0; + struct kvm_memslots *slots; + + slots = rcu_dereference(kvm->memslots); - for (i = 0; i < kvm->nmemslots; i++) { - r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, - kvm->memslots[i].npages); + for (i = 0; i < slots->nmemslots; i++) { + r = kvm_iommu_map_pages(kvm, &slots->memslots[i]); if (r) break; } @@ -104,7 +106,8 @@ int kvm_assign_device(struct kvm *kvm, r = iommu_attach_device(domain, &pdev->dev); if (r) { - printk(KERN_ERR "assign device %x:%x.%x failed", + printk(KERN_ERR "assign device %x:%x:%x.%x failed", + pci_domain_nr(pdev->bus), pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); @@ -125,7 +128,8 @@ int kvm_assign_device(struct kvm *kvm, goto out_unmap; } - printk(KERN_DEBUG "assign device: host bdf = %x:%x:%x\n", + printk(KERN_DEBUG "assign device %x:%x:%x.%x\n", + assigned_dev->host_segnr, assigned_dev->host_busnr, PCI_SLOT(assigned_dev->host_devfn), PCI_FUNC(assigned_dev->host_devfn)); @@ -152,7 +156,8 @@ int kvm_deassign_device(struct kvm *kvm, iommu_detach_device(domain, &pdev->dev); - printk(KERN_DEBUG "deassign device: host bdf = %x:%x:%x\n", + printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n", + assigned_dev->host_segnr, assigned_dev->host_busnr, PCI_SLOT(assigned_dev->host_devfn), PCI_FUNC(assigned_dev->host_devfn)); @@ -210,10 +215,13 @@ static void kvm_iommu_put_pages(struct kvm *kvm, static int kvm_iommu_unmap_memslots(struct kvm *kvm) { int i; + struct kvm_memslots *slots; + + slots = rcu_dereference(kvm->memslots); - for (i = 0; i < kvm->nmemslots; i++) { - kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn, - kvm->memslots[i].npages); + for (i = 0; i < slots->nmemslots; i++) { + kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn, + slots->memslots[i].npages); } return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a944be392d6..548f9253c19 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -44,6 +44,8 @@ #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/compat.h> +#include <linux/srcu.h> +#include <linux/hugetlb.h> #include <asm/processor.h> #include <asm/io.h> @@ -51,9 +53,7 @@ #include <asm/pgtable.h> #include <asm-generic/bitops/le.h> -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET #include "coalesced_mmio.h" -#endif #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> @@ -86,6 +86,8 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, static int hardware_enable_all(void); static void hardware_disable_all(void); +static void kvm_io_bus_destroy(struct kvm_io_bus *bus); + static bool kvm_rebooting; static bool largepages_enabled = true; @@ -136,7 +138,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) zalloc_cpumask_var(&cpus, GFP_ATOMIC); - spin_lock(&kvm->requests_lock); + raw_spin_lock(&kvm->requests_lock); me = smp_processor_id(); kvm_for_each_vcpu(i, vcpu, kvm) { if (test_and_set_bit(req, &vcpu->requests)) @@ -151,7 +153,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) smp_call_function_many(cpus, ack_flush, NULL, 1); else called = false; - spin_unlock(&kvm->requests_lock); + raw_spin_unlock(&kvm->requests_lock); free_cpumask_var(cpus); return called; } @@ -215,7 +217,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, unsigned long address) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int need_tlb_flush; + int need_tlb_flush, idx; /* * When ->invalidate_page runs, the linux pte has been zapped @@ -235,10 +237,12 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, * pte after kvm_unmap_hva returned, without noticing the page * is going to be freed. */ + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; need_tlb_flush = kvm_unmap_hva(kvm, address); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) @@ -252,11 +256,14 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, pte_t pte) { struct kvm *kvm = mmu_notifier_to_kvm(mn); + int idx; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; kvm_set_spte_hva(kvm, address, pte); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); } static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, @@ -265,8 +272,9 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, unsigned long end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int need_tlb_flush = 0; + int need_tlb_flush = 0, idx; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); /* * The count increase must become visible at unlock time as no @@ -277,6 +285,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, for (; start < end; start += PAGE_SIZE) need_tlb_flush |= kvm_unmap_hva(kvm, start); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) @@ -314,11 +323,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, unsigned long address) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int young; + int young, idx; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); young = kvm_age_hva(kvm, address); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); if (young) kvm_flush_remote_tlbs(kvm); @@ -341,15 +352,26 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; + +static int kvm_init_mmu_notifier(struct kvm *kvm) +{ + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; + return mmu_notifier_register(&kvm->mmu_notifier, current->mm); +} + +#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ + +static int kvm_init_mmu_notifier(struct kvm *kvm) +{ + return 0; +} + #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ static struct kvm *kvm_create_vm(void) { - int r = 0; + int r = 0, i; struct kvm *kvm = kvm_arch_create_vm(); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - struct page *page; -#endif if (IS_ERR(kvm)) goto out; @@ -363,39 +385,35 @@ static struct kvm *kvm_create_vm(void) INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); #endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; + r = -ENOMEM; + kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!kvm->memslots) goto out_err; - } - kvm->coalesced_mmio_ring = - (struct kvm_coalesced_mmio_ring *)page_address(page); -#endif - -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - { - kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; - r = mmu_notifier_register(&kvm->mmu_notifier, current->mm); - if (r) { -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - put_page(page); -#endif + if (init_srcu_struct(&kvm->srcu)) + goto out_err; + for (i = 0; i < KVM_NR_BUSES; i++) { + kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), + GFP_KERNEL); + if (!kvm->buses[i]) { + cleanup_srcu_struct(&kvm->srcu); goto out_err; } } -#endif + + r = kvm_init_mmu_notifier(kvm); + if (r) { + cleanup_srcu_struct(&kvm->srcu); + goto out_err; + } kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); spin_lock_init(&kvm->mmu_lock); - spin_lock_init(&kvm->requests_lock); - kvm_io_bus_init(&kvm->pio_bus); + raw_spin_lock_init(&kvm->requests_lock); kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); - kvm_io_bus_init(&kvm->mmio_bus); - init_rwsem(&kvm->slots_lock); + mutex_init(&kvm->slots_lock); atomic_set(&kvm->users_count, 1); spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); @@ -406,12 +424,12 @@ static struct kvm *kvm_create_vm(void) out: return kvm; -#if defined(KVM_COALESCED_MMIO_PAGE_OFFSET) || \ - (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) out_err: hardware_disable_all(); -#endif out_err_nodisable: + for (i = 0; i < KVM_NR_BUSES; i++) + kfree(kvm->buses[i]); + kfree(kvm->memslots); kfree(kvm); return ERR_PTR(r); } @@ -446,13 +464,17 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, void kvm_free_physmem(struct kvm *kvm) { int i; + struct kvm_memslots *slots = kvm->memslots; + + for (i = 0; i < slots->nmemslots; ++i) + kvm_free_physmem_slot(&slots->memslots[i], NULL); - for (i = 0; i < kvm->nmemslots; ++i) - kvm_free_physmem_slot(&kvm->memslots[i], NULL); + kfree(kvm->memslots); } static void kvm_destroy_vm(struct kvm *kvm) { + int i; struct mm_struct *mm = kvm->mm; kvm_arch_sync_events(kvm); @@ -460,12 +482,9 @@ static void kvm_destroy_vm(struct kvm *kvm) list_del(&kvm->vm_list); spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); - kvm_io_bus_destroy(&kvm->pio_bus); - kvm_io_bus_destroy(&kvm->mmio_bus); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - if (kvm->coalesced_mmio_ring != NULL) - free_page((unsigned long)kvm->coalesced_mmio_ring); -#endif + for (i = 0; i < KVM_NR_BUSES; i++) + kvm_io_bus_destroy(kvm->buses[i]); + kvm_coalesced_mmio_free(kvm); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); #else @@ -512,12 +531,13 @@ int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc) { - int r; + int r, flush_shadow = 0; gfn_t base_gfn; unsigned long npages; unsigned long i; struct kvm_memory_slot *memslot; struct kvm_memory_slot old, new; + struct kvm_memslots *slots, *old_memslots; r = -EINVAL; /* General sanity checks */ @@ -532,7 +552,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) goto out; - memslot = &kvm->memslots[mem->slot]; + memslot = &kvm->memslots->memslots[mem->slot]; base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; npages = mem->memory_size >> PAGE_SHIFT; @@ -553,7 +573,7 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Check for overlaps */ r = -EEXIST; for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *s = &kvm->memslots[i]; + struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; if (s == memslot || !s->npages) continue; @@ -579,15 +599,7 @@ int __kvm_set_memory_region(struct kvm *kvm, memset(new.rmap, 0, npages * sizeof(*new.rmap)); new.user_alloc = user_alloc; - /* - * hva_to_rmmap() serialzies with the mmu_lock and to be - * safe it has to ignore memslots with !user_alloc && - * !userspace_addr. - */ - if (user_alloc) - new.userspace_addr = mem->userspace_addr; - else - new.userspace_addr = 0; + new.userspace_addr = mem->userspace_addr; } if (!npages) goto skip_lpage; @@ -642,8 +654,9 @@ skip_lpage: if (!new.dirty_bitmap) goto out_free; memset(new.dirty_bitmap, 0, dirty_bytes); + /* destroy any largepage mappings for dirty tracking */ if (old.npages) - kvm_arch_flush_shadow(kvm); + flush_shadow = 1; } #else /* not defined CONFIG_S390 */ new.user_alloc = user_alloc; @@ -651,36 +664,72 @@ skip_lpage: new.userspace_addr = mem->userspace_addr; #endif /* not defined CONFIG_S390 */ - if (!npages) + if (!npages) { + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + if (mem->slot >= slots->nmemslots) + slots->nmemslots = mem->slot + 1; + slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; + + old_memslots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + /* From this point no new shadow pages pointing to a deleted + * memslot will be created. + * + * validation of sp->gfn happens in: + * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) + * - kvm_is_visible_gfn (mmu_check_roots) + */ kvm_arch_flush_shadow(kvm); + kfree(old_memslots); + } - spin_lock(&kvm->mmu_lock); - if (mem->slot >= kvm->nmemslots) - kvm->nmemslots = mem->slot + 1; - - *memslot = new; - spin_unlock(&kvm->mmu_lock); - - r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); - if (r) { - spin_lock(&kvm->mmu_lock); - *memslot = old; - spin_unlock(&kvm->mmu_lock); + r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); + if (r) goto out_free; - } - kvm_free_physmem_slot(&old, npages ? &new : NULL); - /* Slot deletion case: we have to update the current slot */ - spin_lock(&kvm->mmu_lock); - if (!npages) - *memslot = old; - spin_unlock(&kvm->mmu_lock); #ifdef CONFIG_DMAR /* map the pages in iommu page table */ - r = kvm_iommu_map_pages(kvm, base_gfn, npages); - if (r) - goto out; + if (npages) { + r = kvm_iommu_map_pages(kvm, &new); + if (r) + goto out_free; + } #endif + + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + if (mem->slot >= slots->nmemslots) + slots->nmemslots = mem->slot + 1; + + /* actual memory is freed via old in kvm_free_physmem_slot below */ + if (!npages) { + new.rmap = NULL; + new.dirty_bitmap = NULL; + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) + new.lpage_info[i] = NULL; + } + + slots->memslots[mem->slot] = new; + old_memslots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + + kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); + + kvm_free_physmem_slot(&old, &new); + kfree(old_memslots); + + if (flush_shadow) + kvm_arch_flush_shadow(kvm); + return 0; out_free: @@ -697,9 +746,9 @@ int kvm_set_memory_region(struct kvm *kvm, { int r; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = __kvm_set_memory_region(kvm, mem, user_alloc); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } EXPORT_SYMBOL_GPL(kvm_set_memory_region); @@ -726,7 +775,7 @@ int kvm_get_dirty_log(struct kvm *kvm, if (log->slot >= KVM_MEMORY_SLOTS) goto out; - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; r = -ENOENT; if (!memslot->dirty_bitmap) goto out; @@ -780,9 +829,10 @@ EXPORT_SYMBOL_GPL(kvm_is_error_hva); struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) { int i; + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); - for (i = 0; i < kvm->nmemslots; ++i) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + for (i = 0; i < slots->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; if (gfn >= memslot->base_gfn && gfn < memslot->base_gfn + memslot->npages) @@ -801,10 +851,14 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { int i; + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); - gfn = unalias_gfn(kvm, gfn); + gfn = unalias_gfn_instantiation(kvm, gfn); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + struct kvm_memory_slot *memslot = &slots->memslots[i]; + + if (memslot->flags & KVM_MEMSLOT_INVALID) + continue; if (gfn >= memslot->base_gfn && gfn < memslot->base_gfn + memslot->npages) @@ -814,33 +868,68 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); +unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +{ + struct vm_area_struct *vma; + unsigned long addr, size; + + size = PAGE_SIZE; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return PAGE_SIZE; + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, addr); + if (!vma) + goto out; + + size = vma_kernel_pagesize(vma); + +out: + up_read(¤t->mm->mmap_sem); + + return size; +} + +int memslot_id(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); + struct kvm_memory_slot *memslot = NULL; + + gfn = unalias_gfn(kvm, gfn); + for (i = 0; i < slots->nmemslots; ++i) { + memslot = &slots->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + break; + } + + return memslot - slots->memslots; +} + unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - gfn = unalias_gfn(kvm, gfn); + gfn = unalias_gfn_instantiation(kvm, gfn); slot = gfn_to_memslot_unaliased(kvm, gfn); - if (!slot) + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return bad_hva(); return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); } EXPORT_SYMBOL_GPL(gfn_to_hva); -pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr) { struct page *page[1]; - unsigned long addr; int npages; pfn_t pfn; might_sleep(); - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) { - get_page(bad_page); - return page_to_pfn(bad_page); - } - npages = get_user_pages_fast(addr, 1, 1, page); if (unlikely(npages != 1)) { @@ -865,8 +954,32 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) return pfn; } +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +{ + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + return hva_to_pfn(kvm, addr); +} EXPORT_SYMBOL_GPL(gfn_to_pfn); +static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); +} + +pfn_t gfn_to_pfn_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) +{ + unsigned long addr = gfn_to_hva_memslot(slot, gfn); + return hva_to_pfn(kvm, addr); +} + struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { pfn_t pfn; @@ -1854,12 +1967,7 @@ static struct notifier_block kvm_reboot_notifier = { .priority = 0, }; -void kvm_io_bus_init(struct kvm_io_bus *bus) -{ - memset(bus, 0, sizeof(*bus)); -} - -void kvm_io_bus_destroy(struct kvm_io_bus *bus) +static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { int i; @@ -1868,13 +1976,15 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus) kvm_iodevice_destructor(pos); } + kfree(bus); } /* kvm_io_bus_write - called under kvm->slots_lock */ -int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, +int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { int i; + struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]); for (i = 0; i < bus->dev_count; i++) if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) return 0; @@ -1882,59 +1992,71 @@ int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, } /* kvm_io_bus_read - called under kvm->slots_lock */ -int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val) +int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, void *val) { int i; + struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]); + for (i = 0; i < bus->dev_count; i++) if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) return 0; return -EOPNOTSUPP; } -int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, - struct kvm_io_device *dev) +/* Caller must hold slots_lock. */ +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { - int ret; - - down_write(&kvm->slots_lock); - ret = __kvm_io_bus_register_dev(bus, dev); - up_write(&kvm->slots_lock); + struct kvm_io_bus *new_bus, *bus; - return ret; -} - -/* An unlocked version. Caller must have write lock on slots_lock. */ -int __kvm_io_bus_register_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev) -{ + bus = kvm->buses[bus_idx]; if (bus->dev_count > NR_IOBUS_DEVS-1) return -ENOSPC; - bus->devs[bus->dev_count++] = dev; + new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); + if (!new_bus) + return -ENOMEM; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + new_bus->devs[new_bus->dev_count++] = dev; + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); + kfree(bus); return 0; } -void kvm_io_bus_unregister_dev(struct kvm *kvm, - struct kvm_io_bus *bus, - struct kvm_io_device *dev) +/* Caller must hold slots_lock. */ +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { - down_write(&kvm->slots_lock); - __kvm_io_bus_unregister_dev(bus, dev); - up_write(&kvm->slots_lock); -} + int i, r; + struct kvm_io_bus *new_bus, *bus; -/* An unlocked version. Caller must have write lock on slots_lock. */ -void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev) -{ - int i; + new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); + if (!new_bus) + return -ENOMEM; - for (i = 0; i < bus->dev_count; i++) - if (bus->devs[i] == dev) { - bus->devs[i] = bus->devs[--bus->dev_count]; + bus = kvm->buses[bus_idx]; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + + r = -ENOENT; + for (i = 0; i < new_bus->dev_count; i++) + if (new_bus->devs[i] == dev) { + r = 0; + new_bus->devs[i] = new_bus->devs[--new_bus->dev_count]; break; } + + if (r) { + kfree(new_bus); + return r; + } + + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); + kfree(bus); + return r; } static struct notifier_block kvm_cpu_notifier = { |