summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/feature-removal-schedule.txt32
-rw-r--r--Documentation/filesystems/Locking18
-rw-r--r--Documentation/kvm/api.txt12
-rw-r--r--MAINTAINERS2
-rw-r--r--arch/ia64/kvm/Kconfig1
-rw-r--r--arch/ia64/kvm/kvm-ia64.c50
-rw-r--r--arch/ia64/kvm/kvm_fw.c28
-rw-r--r--arch/ia64/kvm/mmio.c4
-rw-r--r--arch/ia64/kvm/vcpu.c4
-rw-r--r--arch/powerpc/include/asm/kvm_asm.h6
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h11
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64_asm.h18
-rw-r--r--arch/powerpc/include/asm/kvm_e500.h3
-rw-r--r--arch/powerpc/include/asm/kvm_host.h23
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h83
-rw-r--r--arch/powerpc/include/asm/paca.h5
-rw-r--r--arch/powerpc/include/asm/reg.h4
-rw-r--r--arch/powerpc/kernel/asm-offsets.c33
-rw-r--r--arch/powerpc/kernel/ppc_ksyms.c1
-rw-r--r--arch/powerpc/kvm/44x_emulate.c25
-rw-r--r--arch/powerpc/kvm/44x_tlb.c20
-rw-r--r--arch/powerpc/kvm/Kconfig1
-rw-r--r--arch/powerpc/kvm/book3s.c309
-rw-r--r--arch/powerpc/kvm/book3s_64_emulate.c77
-rw-r--r--arch/powerpc/kvm/book3s_64_exports.c8
-rw-r--r--arch/powerpc/kvm/book3s_64_interrupts.S336
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu.c10
-rw-r--r--arch/powerpc/kvm/book3s_64_rmhandlers.S119
-rw-r--r--arch/powerpc/kvm/book3s_64_slb.S160
-rw-r--r--arch/powerpc/kvm/booke.c87
-rw-r--r--arch/powerpc/kvm/booke_emulate.c107
-rw-r--r--arch/powerpc/kvm/e500.c6
-rw-r--r--arch/powerpc/kvm/e500_emulate.c93
-rw-r--r--arch/powerpc/kvm/e500_tlb.c10
-rw-r--r--arch/powerpc/kvm/emulate.c118
-rw-r--r--arch/powerpc/kvm/powerpc.c40
-rw-r--r--arch/s390/kvm/kvm-s390.c26
-rw-r--r--arch/s390/kvm/kvm-s390.h10
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/hyperv.h186
-rw-r--r--arch/x86/include/asm/kvm_emulate.h17
-rw-r--r--arch/x86/include/asm/kvm_host.h60
-rw-r--r--arch/x86/include/asm/kvm_para.h1
-rw-r--r--arch/x86/include/asm/svm.h2
-rw-r--r--arch/x86/include/asm/vmx.h5
-rw-r--r--arch/x86/kernel/vsyscall_64.c3
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/emulate.c440
-rw-r--r--arch/x86/kvm/i8254.c23
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/i8259.c46
-rw-r--r--arch/x86/kvm/irq.h3
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h31
-rw-r--r--arch/x86/kvm/lapic.c31
-rw-r--r--arch/x86/kvm/lapic.h8
-rw-r--r--arch/x86/kvm/mmu.c137
-rw-r--r--arch/x86/kvm/mmu.h35
-rw-r--r--arch/x86/kvm/paging_tmpl.h13
-rw-r--r--arch/x86/kvm/svm.c237
-rw-r--r--arch/x86/kvm/trace.h59
-rw-r--r--arch/x86/kvm/vmx.c396
-rw-r--r--arch/x86/kvm/x86.c1098
-rw-r--r--arch/x86/kvm/x86.h30
-rw-r--r--drivers/staging/pohmelfs/inode.c2
-rw-r--r--fs/9p/fid.c2
-rw-r--r--fs/9p/v9fs.c8
-rw-r--r--fs/9p/v9fs.h23
-rw-r--r--fs/9p/vfs_dir.c2
-rw-r--r--fs/9p/vfs_file.c4
-rw-r--r--fs/9p/vfs_inode.c48
-rw-r--r--fs/attr.c11
-rw-r--r--fs/ext2/balloc.c12
-rw-r--r--fs/ext2/file.c5
-rw-r--r--fs/ext2/ialloc.c14
-rw-r--r--fs/ext2/inode.c7
-rw-r--r--fs/ext2/namei.c51
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/xattr.c10
-rw-r--r--fs/ext3/balloc.c11
-rw-r--r--fs/ext3/file.c7
-rw-r--r--fs/ext3/ialloc.c16
-rw-r--r--fs/ext3/inode.c41
-rw-r--r--fs/ext3/namei.c24
-rw-r--r--fs/ext3/super.c246
-rw-r--r--fs/ext3/xattr.c22
-rw-r--r--fs/ext4/file.c3
-rw-r--r--fs/ext4/ialloc.c16
-rw-r--r--fs/ext4/inode.c27
-rw-r--r--fs/ext4/mballoc.c6
-rw-r--r--fs/ext4/namei.c23
-rw-r--r--fs/ext4/super.c15
-rw-r--r--fs/ext4/xattr.c8
-rw-r--r--fs/gfs2/quota.c9
-rw-r--r--fs/gfs2/quota.h2
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/gfs2/sys.c2
-rw-r--r--fs/inode.c4
-rw-r--r--fs/jbd/commit.c10
-rw-r--r--fs/jbd/transaction.c43
-rw-r--r--fs/jfs/acl.c26
-rw-r--r--fs/jfs/file.c31
-rw-r--r--fs/jfs/inode.c9
-rw-r--r--fs/jfs/jfs_acl.h7
-rw-r--r--fs/jfs/jfs_dtree.c28
-rw-r--r--fs/jfs/jfs_extent.c16
-rw-r--r--fs/jfs/jfs_inode.c8
-rw-r--r--fs/jfs/jfs_inode.h1
-rw-r--r--fs/jfs/jfs_xtree.c21
-rw-r--r--fs/jfs/namei.c23
-rw-r--r--fs/jfs/super.c6
-rw-r--r--fs/jfs/xattr.c17
-rw-r--r--fs/namei.c16
-rw-r--r--fs/nfsd/vfs.c4
-rw-r--r--fs/ocfs2/alloc.c13
-rw-r--r--fs/ocfs2/aops.c11
-rw-r--r--fs/ocfs2/dir.c37
-rw-r--r--fs/ocfs2/file.c20
-rw-r--r--fs/ocfs2/inode.c6
-rw-r--r--fs/ocfs2/namei.c52
-rw-r--r--fs/ocfs2/quota_global.c7
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/open.c5
-rw-r--r--fs/quota/Kconfig5
-rw-r--r--fs/quota/Makefile2
-rw-r--r--fs/quota/compat.c118
-rw-r--r--fs/quota/dquot.c412
-rw-r--r--fs/quota/netlink.c95
-rw-r--r--fs/quota/quota.c735
-rw-r--r--fs/reiserfs/bitmap.c10
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/inode.c20
-rw-r--r--fs/reiserfs/namei.c23
-rw-r--r--fs/reiserfs/stree.c20
-rw-r--r--fs/reiserfs/super.c15
-rw-r--r--fs/reiserfs/xattr.c4
-rw-r--r--fs/sync.c14
-rw-r--r--fs/udf/balloc.c35
-rw-r--r--fs/udf/file.c28
-rw-r--r--fs/udf/ialloc.c14
-rw-r--r--fs/udf/inode.c6
-rw-r--r--fs/udf/namei.c17
-rw-r--r--fs/ufs/balloc.c24
-rw-r--r--fs/ufs/file.c3
-rw-r--r--fs/ufs/ialloc.c11
-rw-r--r--fs/ufs/inode.c4
-rw-r--r--fs/ufs/namei.c18
-rw-r--r--fs/ufs/super.c6
-rw-r--r--fs/ufs/truncate.c10
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c19
-rw-r--r--include/linux/ext3_fs.h33
-rw-r--r--include/linux/ext3_fs_i.h2
-rw-r--r--include/linux/jbd.h11
-rw-r--r--include/linux/jbd2.h11
-rw-r--r--include/linux/kvm.h10
-rw-r--r--include/linux/kvm_host.h71
-rw-r--r--include/linux/quota.h33
-rw-r--r--include/linux/quotaops.h304
-rw-r--r--include/linux/virtio_9p.h3
-rw-r--r--include/net/9p/client.h18
-rw-r--r--include/trace/events/kvm.h41
-rw-r--r--net/9p/client.c114
-rw-r--r--net/9p/protocol.c74
-rw-r--r--net/9p/protocol.h6
-rw-r--r--net/9p/trans_virtio.c80
-rw-r--r--virt/kvm/Kconfig3
-rw-r--r--virt/kvm/assigned-dev.c12
-rw-r--r--virt/kvm/coalesced_mmio.c43
-rw-r--r--virt/kvm/coalesced_mmio.h15
-rw-r--r--virt/kvm/eventfd.c21
-rw-r--r--virt/kvm/ioapic.c38
-rw-r--r--virt/kvm/ioapic.h2
-rw-r--r--virt/kvm/iommu.c36
-rw-r--r--virt/kvm/kvm_main.c392
173 files changed, 5562 insertions, 3350 deletions
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 03497909539..31575e220f3 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -556,3 +556,35 @@ Why: udev fully replaces this special file system that only contains CAPI
NCCI TTY device nodes. User space (pppdcapiplugin) works without
noticing the difference.
Who: Jan Kiszka <jan.kiszka@web.de>
+
+----------------------------
+
+What: KVM memory aliases support
+When: July 2010
+Why: Memory aliasing support is used for speeding up guest vga access
+ through the vga windows.
+
+ Modern userspace no longer uses this feature, so it's just bitrotted
+ code and can be removed with no impact.
+Who: Avi Kivity <avi@redhat.com>
+
+----------------------------
+
+What: KVM kernel-allocated memory slots
+When: July 2010
+Why: Since 2.6.25, kvm supports user-allocated memory slots, which are
+ much more flexible than kernel-allocated slots. All current userspace
+ supports the newer interface and this code can be removed with no
+ impact.
+Who: Avi Kivity <avi@redhat.com>
+
+----------------------------
+
+What: KVM paravirt mmu host support
+When: January 2011
+Why: The paravirt mmu host support is slower than non-paravirt mmu, both
+ on newer and older hardware. It is already not exposed to the guest,
+ and kept only for live migration purposes.
+Who: Avi Kivity <avi@redhat.com>
+
+----------------------------
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 18b9d0ca063..06bbbed7120 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -460,13 +460,6 @@ in sys_read() and friends.
--------------------------- dquot_operations -------------------------------
prototypes:
- int (*initialize) (struct inode *, int);
- int (*drop) (struct inode *);
- int (*alloc_space) (struct inode *, qsize_t, int);
- int (*alloc_inode) (const struct inode *, unsigned long);
- int (*free_space) (struct inode *, qsize_t);
- int (*free_inode) (const struct inode *, unsigned long);
- int (*transfer) (struct inode *, struct iattr *);
int (*write_dquot) (struct dquot *);
int (*acquire_dquot) (struct dquot *);
int (*release_dquot) (struct dquot *);
@@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations.
What filesystem should expect from the generic quota functions:
FS recursion Held locks when called
-initialize: yes maybe dqonoff_sem
-drop: yes -
-alloc_space: ->mark_dirty() -
-alloc_inode: ->mark_dirty() -
-free_space: ->mark_dirty() -
-free_inode: ->mark_dirty() -
-transfer: yes -
write_dquot: yes dqonoff_sem or dqptr_sem
acquire_dquot: yes dqonoff_sem or dqptr_sem
release_dquot: yes dqonoff_sem or dqptr_sem
@@ -495,10 +481,6 @@ write_info: yes dqonoff_sem
FS recursion means calling ->quota_read() and ->quota_write() from superblock
operations.
-->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
-only directly by the filesystem and do not call any fs functions only
-the ->mark_dirty() operation.
-
More details about quota locking can be found in fs/dquot.c.
--------------------------- vm_operations_struct -----------------------------
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 2811e452f75..c6416a39816 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -23,12 +23,12 @@ of a virtual machine. The ioctls belong to three classes
Only run vcpu ioctls from the same thread that was used to create the
vcpu.
-2. File descritpors
+2. File descriptors
The kvm API is centered around file descriptors. An initial
open("/dev/kvm") obtains a handle to the kvm subsystem; this handle
can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this
-handle will create a VM file descripror which can be used to issue VM
+handle will create a VM file descriptor which can be used to issue VM
ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu
and return a file descriptor pointing to it. Finally, ioctls on a vcpu
fd can be used to control the vcpu, including the important task of
@@ -643,7 +643,7 @@ Type: vm ioctl
Parameters: struct kvm_clock_data (in)
Returns: 0 on success, -1 on error
-Sets the current timestamp of kvmclock to the valued specific in its parameter.
+Sets the current timestamp of kvmclock to the value specified in its parameter.
In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios
such as migration.
@@ -795,11 +795,11 @@ Unused.
__u64 data_offset; /* relative to kvm_run start */
} io;
-If exit_reason is KVM_EXIT_IO_IN or KVM_EXIT_IO_OUT, then the vcpu has
+If exit_reason is KVM_EXIT_IO, then the vcpu has
executed a port I/O instruction which could not be satisfied by kvm.
data_offset describes where the data is located (KVM_EXIT_IO_OUT) or
where kvm expects application code to place the data for the next
-KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a patcked array.
+KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array.
struct {
struct kvm_debug_exit_arch arch;
@@ -815,7 +815,7 @@ Unused.
__u8 is_write;
} mmio;
-If exit_reason is KVM_EXIT_MMIO or KVM_EXIT_IO_OUT, then the vcpu has
+If exit_reason is KVM_EXIT_MMIO, then the vcpu has
executed a memory-mapped I/O instruction which could not be satisfied
by kvm. The 'data' member contains the written data if 'is_write' is
true, and should be filled by application code otherwise.
diff --git a/MAINTAINERS b/MAINTAINERS
index c6591bca646..51d8b5221dd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3173,7 +3173,7 @@ F: arch/x86/include/asm/svm.h
F: arch/x86/kvm/svm.c
KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC
-M: Hollis Blanchard <hollisb@us.ibm.com>
+M: Alexander Graf <agraf@suse.de>
L: kvm-ppc@vger.kernel.org
W: http://kvm.qumranet.com
S: Supported
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 01c75797119..fa4d1e59deb 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -26,6 +26,7 @@ config KVM
select ANON_INODES
select HAVE_KVM_IRQCHIP
select KVM_APIC_ARCHITECTURE
+ select KVM_MMIO
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 5fdeec5fddc..26e0e089bfe 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -241,10 +241,10 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 0;
mmio:
if (p->dir)
- r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr,
+ r = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, p->addr,
p->size, &p->data);
else
- r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr,
+ r = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, p->addr,
p->size, &p->data);
if (r)
printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr);
@@ -636,12 +636,9 @@ static void kvm_vcpu_post_transition(struct kvm_vcpu *vcpu)
static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
union context *host_ctx, *guest_ctx;
- int r;
+ int r, idx;
- /*
- * down_read() may sleep and return with interrupts enabled
- */
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
again:
if (signal_pending(current)) {
@@ -663,7 +660,7 @@ again:
if (r < 0)
goto vcpu_run_fail;
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
kvm_guest_enter();
/*
@@ -687,7 +684,7 @@ again:
kvm_guest_exit();
preempt_enable();
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_handle_exit(kvm_run, vcpu);
@@ -697,10 +694,10 @@ again:
}
out:
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
if (r > 0) {
kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
goto again;
}
@@ -971,7 +968,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
r = kvm_setup_default_irq_routing(kvm);
if (r) {
- kfree(kvm->arch.vioapic);
+ kvm_ioapic_destroy(kvm);
goto out;
}
break;
@@ -1377,12 +1374,14 @@ static void free_kvm(struct kvm *kvm)
static void kvm_release_vm_pages(struct kvm *kvm)
{
+ struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i, j;
unsigned long base_gfn;
- for (i = 0; i < kvm->nmemslots; i++) {
- memslot = &kvm->memslots[i];
+ slots = rcu_dereference(kvm->memslots);
+ for (i = 0; i < slots->nmemslots; i++) {
+ memslot = &slots->memslots[i];
base_gfn = memslot->base_gfn;
for (j = 0; j < memslot->npages; j++) {
@@ -1405,6 +1404,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vioapic);
kvm_release_vm_pages(kvm);
kvm_free_physmem(kvm);
+ cleanup_srcu_struct(&kvm->srcu);
free_kvm(kvm);
}
@@ -1576,15 +1576,15 @@ out:
return r;
}
-int kvm_arch_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
int user_alloc)
{
unsigned long i;
unsigned long pfn;
- int npages = mem->memory_size >> PAGE_SHIFT;
- struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+ int npages = memslot->npages;
unsigned long base_gfn = memslot->base_gfn;
if (base_gfn + npages > (KVM_MAX_MEM_SIZE >> PAGE_SHIFT))
@@ -1608,6 +1608,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
return 0;
}
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+ return;
+}
+
void kvm_arch_flush_shadow(struct kvm *kvm)
{
kvm_flush_remote_tlbs(kvm);
@@ -1802,7 +1810,7 @@ static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
if (log->slot >= KVM_MEMORY_SLOTS)
goto out;
- memslot = &kvm->memslots[log->slot];
+ memslot = &kvm->memslots->memslots[log->slot];
r = -ENOENT;
if (!memslot->dirty_bitmap)
goto out;
@@ -1827,6 +1835,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
int is_dirty = 0;
+ mutex_lock(&kvm->slots_lock);
spin_lock(&kvm->arch.dirty_log_lock);
r = kvm_ia64_sync_dirty_log(kvm, log);
@@ -1840,12 +1849,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
kvm_flush_remote_tlbs(kvm);
- memslot = &kvm->memslots[log->slot];
+ memslot = &kvm->memslots->memslots[log->slot];
n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
memset(memslot->dirty_bitmap, 0, n);
}
r = 0;
out:
+ mutex_unlock(&kvm->slots_lock);
spin_unlock(&kvm->arch.dirty_log_lock);
return r;
}
diff --git a/arch/ia64/kvm/kvm_fw.c b/arch/ia64/kvm/kvm_fw.c
index e4b82319881..cb548ee9fca 100644
--- a/arch/ia64/kvm/kvm_fw.c
+++ b/arch/ia64/kvm/kvm_fw.c
@@ -75,7 +75,7 @@ static void set_pal_result(struct kvm_vcpu *vcpu,
struct exit_ctl_data *p;
p = kvm_get_exit_data(vcpu);
- if (p && p->exit_reason == EXIT_REASON_PAL_CALL) {
+ if (p->exit_reason == EXIT_REASON_PAL_CALL) {
p->u.pal_data.ret = result;
return ;
}
@@ -87,7 +87,7 @@ static void set_sal_result(struct kvm_vcpu *vcpu,
struct exit_ctl_data *p;
p = kvm_get_exit_data(vcpu);
- if (p && p->exit_reason == EXIT_REASON_SAL_CALL) {
+ if (p->exit_reason == EXIT_REASON_SAL_CALL) {
p->u.sal_data.ret = result;
return ;
}
@@ -322,7 +322,7 @@ static u64 kvm_get_pal_call_index(struct kvm_vcpu *vcpu)
struct exit_ctl_data *p;
p = kvm_get_exit_data(vcpu);
- if (p && (p->exit_reason == EXIT_REASON_PAL_CALL))
+ if (p->exit_reason == EXIT_REASON_PAL_CALL)
index = p->u.pal_data.gr28;
return index;
@@ -646,18 +646,16 @@ static void kvm_get_sal_call_data(struct kvm_vcpu *vcpu, u64 *in0, u64 *in1,
p = kvm_get_exit_data(vcpu);
- if (p) {
- if (p->exit_reason == EXIT_REASON_SAL_CALL) {
- *in0 = p->u.sal_data.in0;
- *in1 = p->u.sal_data.in1;
- *in2 = p->u.sal_data.in2;
- *in3 = p->u.sal_data.in3;
- *in4 = p->u.sal_data.in4;
- *in5 = p->u.sal_data.in5;
- *in6 = p->u.sal_data.in6;
- *in7 = p->u.sal_data.in7;
- return ;
- }
+ if (p->exit_reason == EXIT_REASON_SAL_CALL) {
+ *in0 = p->u.sal_data.in0;
+ *in1 = p->u.sal_data.in1;
+ *in2 = p->u.sal_data.in2;
+ *in3 = p->u.sal_data.in3;
+ *in4 = p->u.sal_data.in4;
+ *in5 = p->u.sal_data.in5;
+ *in6 = p->u.sal_data.in6;
+ *in7 = p->u.sal_data.in7;
+ return ;
}
*in0 = 0;
}
diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c
index 9bf55afd08d..fb8f9f59a1e 100644
--- a/arch/ia64/kvm/mmio.c
+++ b/arch/ia64/kvm/mmio.c
@@ -316,8 +316,8 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma)
return;
} else {
inst_type = -1;
- panic_vm(vcpu, "Unsupported MMIO access instruction! \
- Bunld[0]=0x%lx, Bundle[1]=0x%lx\n",
+ panic_vm(vcpu, "Unsupported MMIO access instruction! "
+ "Bunld[0]=0x%lx, Bundle[1]=0x%lx\n",
bundle.i64[0], bundle.i64[1]);
}
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index dce75b70cdd..958815c9787 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -1639,8 +1639,8 @@ void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val)
* Otherwise panic
*/
if (val & (IA64_PSR_PK | IA64_PSR_IS | IA64_PSR_VM))
- panic_vm(vcpu, "Only support guests with vpsr.pk =0 \
- & vpsr.is=0\n");
+ panic_vm(vcpu, "Only support guests with vpsr.pk =0 "
+ "& vpsr.is=0\n");
/*
* For those IA64_PSR bits: id/da/dd/ss/ed/ia
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index af2abe74f54..aadf2dd6f84 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -97,4 +97,10 @@
#define RESUME_HOST RESUME_FLAG_HOST
#define RESUME_HOST_NV (RESUME_FLAG_HOST|RESUME_FLAG_NV)
+#define KVM_GUEST_MODE_NONE 0
+#define KVM_GUEST_MODE_GUEST 1
+#define KVM_GUEST_MODE_SKIP 2
+
+#define KVM_INST_FETCH_FAILED -1
+
#endif /* __POWERPC_KVM_ASM_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 74b7369770d..db7db0a9696 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -22,7 +22,7 @@
#include <linux/types.h>
#include <linux/kvm_host.h>
-#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s_64_asm.h>
struct kvmppc_slb {
u64 esid;
@@ -33,7 +33,8 @@ struct kvmppc_slb {
bool Ks;
bool Kp;
bool nx;
- bool large;
+ bool large; /* PTEs are 16MB */
+ bool tb; /* 1TB segment */
bool class;
};
@@ -69,6 +70,7 @@ struct kvmppc_sid_map {
struct kvmppc_vcpu_book3s {
struct kvm_vcpu vcpu;
+ struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
struct kvmppc_sid_map sid_map[SID_MAP_NUM];
struct kvmppc_slb slb[64];
struct {
@@ -89,6 +91,7 @@ struct kvmppc_vcpu_book3s {
u64 vsid_next;
u64 vsid_max;
int context_id;
+ ulong prog_flags; /* flags to inject when giving a 700 trap */
};
#define CONTEXT_HOST 0
@@ -119,6 +122,10 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
extern u32 kvmppc_trampoline_lowmem;
extern u32 kvmppc_trampoline_enter;
+extern void kvmppc_rmcall(ulong srr0, ulong srr1);
+extern void kvmppc_load_up_fpu(void);
+extern void kvmppc_load_up_altivec(void);
+extern void kvmppc_load_up_vsx(void);
static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
{
diff --git a/arch/powerpc/include/asm/kvm_book3s_64_asm.h b/arch/powerpc/include/asm/kvm_book3s_64_asm.h
index 2e06ee8184e..183461b4840 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64_asm.h
@@ -20,6 +20,8 @@
#ifndef __ASM_KVM_BOOK3S_ASM_H__
#define __ASM_KVM_BOOK3S_ASM_H__
+#ifdef __ASSEMBLY__
+
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
#include <asm/kvm_asm.h>
@@ -55,4 +57,20 @@ kvmppc_resume_\intno:
#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
+#else /*__ASSEMBLY__ */
+
+struct kvmppc_book3s_shadow_vcpu {
+ ulong gpr[14];
+ u32 cr;
+ u32 xer;
+ ulong host_r1;
+ ulong host_r2;
+ ulong handler;
+ ulong scratch0;
+ ulong scratch1;
+ ulong vmhandler;
+};
+
+#endif /*__ASSEMBLY__ */
+
#endif /* __ASM_KVM_BOOK3S_ASM_H__ */
diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h
index 9d497ce4972..7fea26fffb2 100644
--- a/arch/powerpc/include/asm/kvm_e500.h
+++ b/arch/powerpc/include/asm/kvm_e500.h
@@ -52,9 +52,12 @@ struct kvmppc_vcpu_e500 {
u32 mas5;
u32 mas6;
u32 mas7;
+ u32 l1csr0;
u32 l1csr1;
u32 hid0;
u32 hid1;
+ u32 tlb0cfg;
+ u32 tlb1cfg;
struct kvm_vcpu vcpu;
};
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 1201f62d0d7..5e5bae7e152 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -167,23 +167,40 @@ struct kvm_vcpu_arch {
ulong trampoline_lowmem;
ulong trampoline_enter;
ulong highmem_handler;
+ ulong rmcall;
ulong host_paca_phys;
struct kvmppc_mmu mmu;
#endif
- u64 fpr[32];
ulong gpr[32];
+ u64 fpr[32];
+ u32 fpscr;
+
+#ifdef CONFIG_ALTIVEC
+ vector128 vr[32];
+ vector128 vscr;
+#endif
+
+#ifdef CONFIG_VSX
+ u64 vsr[32];
+#endif
+
ulong pc;
- u32 cr;
ulong ctr;
ulong lr;
+
+#ifdef CONFIG_BOOKE
ulong xer;
+ u32 cr;
+#endif
ulong msr;
#ifdef CONFIG_PPC64
ulong shadow_msr;
+ ulong shadow_srr1;
ulong hflags;
+ ulong guest_owned_ext;
#endif
u32 mmucr;
ulong sprg0;
@@ -242,6 +259,8 @@ struct kvm_vcpu_arch {
#endif
ulong fault_dear;
ulong fault_esr;
+ ulong queued_dear;
+ ulong queued_esr;
gpa_t paddr_accessed;
u8 io_gpr; /* GPR used as IO source/target */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 269ee46ab02..e2642829e43 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -28,6 +28,9 @@
#include <linux/types.h>
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
+#ifdef CONFIG_PPC_BOOK3S
+#include <asm/kvm_book3s.h>
+#endif
enum emulation_result {
EMULATE_DONE, /* no further processing */
@@ -80,8 +83,9 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu);
extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
-extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu);
extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq);
@@ -95,4 +99,81 @@ extern void kvmppc_booke_exit(void);
extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_PPC_BOOK3S
+
+/* We assume we're always acting on the current vcpu */
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+ if ( num < 14 ) {
+ get_paca()->shadow_vcpu.gpr[num] = val;
+ to_book3s(vcpu)->shadow_vcpu.gpr[num] = val;
+ } else
+ vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+ if ( num < 14 )
+ return get_paca()->shadow_vcpu.gpr[num];
+ else
+ return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+ get_paca()->shadow_vcpu.cr = val;
+ to_book3s(vcpu)->shadow_vcpu.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+ return get_paca()->shadow_vcpu.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+ get_paca()->shadow_vcpu.xer = val;
+ to_book3s(vcpu)->shadow_vcpu.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+ return get_paca()->shadow_vcpu.xer;
+}
+
+#else
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+ vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+ return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+ vcpu->arch.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+ vcpu->arch.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.xer;
+}
+
+#endif
+
#endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 5e9b4ef7141..d8a693109c8 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -19,6 +19,9 @@
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/exception-64e.h>
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/kvm_book3s_64_asm.h>
+#endif
register struct paca_struct *local_paca asm("r13");
@@ -135,6 +138,8 @@ struct paca_struct {
u64 esid;
u64 vsid;
} kvm_slb[64]; /* guest SLB */
+ /* We use this to store guest state in */
+ struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
u8 kvm_slb_max; /* highest used guest slb entry */
u8 kvm_in_guest; /* are we inside the guest? */
#endif
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index bc8dd53f718..5572e86223f 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -426,6 +426,10 @@
#define SRR1_WAKEMT 0x00280000 /* mtctrl */
#define SRR1_WAKEDEC 0x00180000 /* Decrementer interrupt */
#define SRR1_WAKETHERM 0x00100000 /* Thermal management interrupt */
+#define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */
+#define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */
+#define SRR1_PROGTRAP 0x00020000 /* Trap */
+#define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */
#define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */
#define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index a6c2b63227b..957ceb7059c 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -194,6 +194,30 @@ int main(void)
DEFINE(PACA_KVM_IN_GUEST, offsetof(struct paca_struct, kvm_in_guest));
DEFINE(PACA_KVM_SLB, offsetof(struct paca_struct, kvm_slb));
DEFINE(PACA_KVM_SLB_MAX, offsetof(struct paca_struct, kvm_slb_max));
+ DEFINE(PACA_KVM_CR, offsetof(struct paca_struct, shadow_vcpu.cr));
+ DEFINE(PACA_KVM_XER, offsetof(struct paca_struct, shadow_vcpu.xer));
+ DEFINE(PACA_KVM_R0, offsetof(struct paca_struct, shadow_vcpu.gpr[0]));
+ DEFINE(PACA_KVM_R1, offsetof(struct paca_struct, shadow_vcpu.gpr[1]));
+ DEFINE(PACA_KVM_R2, offsetof(struct paca_struct, shadow_vcpu.gpr[2]));
+ DEFINE(PACA_KVM_R3, offsetof(struct paca_struct, shadow_vcpu.gpr[3]));
+ DEFINE(PACA_KVM_R4, offsetof(struct paca_struct, shadow_vcpu.gpr[4]));
+ DEFINE(PACA_KVM_R5, offsetof(struct paca_struct, shadow_vcpu.gpr[5]));
+ DEFINE(PACA_KVM_R6, offsetof(struct paca_struct, shadow_vcpu.gpr[6]));
+ DEFINE(PACA_KVM_R7, offsetof(struct paca_struct, shadow_vcpu.gpr[7]));
+ DEFINE(PACA_KVM_R8, offsetof(struct paca_struct, shadow_vcpu.gpr[8]));
+ DEFINE(PACA_KVM_R9, offsetof(struct paca_struct, shadow_vcpu.gpr[9]));
+ DEFINE(PACA_KVM_R10, offsetof(struct paca_struct, shadow_vcpu.gpr[10]));
+ DEFINE(PACA_KVM_R11, offsetof(struct paca_struct, shadow_vcpu.gpr[11]));
+ DEFINE(PACA_KVM_R12, offsetof(struct paca_struct, shadow_vcpu.gpr[12]));
+ DEFINE(PACA_KVM_R13, offsetof(struct paca_struct, shadow_vcpu.gpr[13]));
+ DEFINE(PACA_KVM_HOST_R1, offsetof(struct paca_struct, shadow_vcpu.host_r1));
+ DEFINE(PACA_KVM_HOST_R2, offsetof(struct paca_struct, shadow_vcpu.host_r2));
+ DEFINE(PACA_KVM_VMHANDLER, offsetof(struct paca_struct,
+ shadow_vcpu.vmhandler));
+ DEFINE(PACA_KVM_SCRATCH0, offsetof(struct paca_struct,
+ shadow_vcpu.scratch0));
+ DEFINE(PACA_KVM_SCRATCH1, offsetof(struct paca_struct,
+ shadow_vcpu.scratch1));
#endif
#endif /* CONFIG_PPC64 */
@@ -389,8 +413,6 @@ int main(void)
DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
- DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
- DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr));
@@ -411,11 +433,16 @@ int main(void)
DEFINE(VCPU_HOST_R2, offsetof(struct kvm_vcpu, arch.host_r2));
DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
+ DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
+ DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
-#endif
+#else
+ DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
+ DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
+#endif /* CONFIG_PPC64 */
#endif
#ifdef CONFIG_44x
DEFINE(PGD_T_LOG2, PGD_T_LOG2);
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 425451453e9..ab3e392ac63 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -107,6 +107,7 @@ EXPORT_SYMBOL(giveup_altivec);
#endif /* CONFIG_ALTIVEC */
#ifdef CONFIG_VSX
EXPORT_SYMBOL(giveup_vsx);
+EXPORT_SYMBOL_GPL(__giveup_vsx);
#endif /* CONFIG_VSX */
#ifdef CONFIG_SPE
EXPORT_SYMBOL(giveup_spe);
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index 61af58fcece..65ea083a5b2 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -65,13 +65,14 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
*/
switch (dcrn) {
case DCRN_CPR0_CONFIG_ADDR:
- vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr);
break;
case DCRN_CPR0_CONFIG_DATA:
local_irq_disable();
mtdcr(DCRN_CPR0_CONFIG_ADDR,
vcpu->arch.cpr0_cfgaddr);
- vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA);
+ kvmppc_set_gpr(vcpu, rt,
+ mfdcr(DCRN_CPR0_CONFIG_DATA));
local_irq_enable();
break;
default:
@@ -93,11 +94,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* emulate some access in kernel */
switch (dcrn) {
case DCRN_CPR0_CONFIG_ADDR:
- vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs];
+ vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs);
break;
default:
run->dcr.dcrn = dcrn;
- run->dcr.data = vcpu->arch.gpr[rs];
+ run->dcr.data = kvmppc_get_gpr(vcpu, rs);
run->dcr.is_write = 1;
vcpu->arch.dcr_needed = 1;
kvmppc_account_exit(vcpu, DCR_EXITS);
@@ -146,13 +147,13 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
switch (sprn) {
case SPRN_PID:
- kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
+ kvmppc_set_pid(vcpu, kvmppc_get_gpr(vcpu, rs)); break;
case SPRN_MMUCR:
- vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.mmucr = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_CCR0:
- vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.ccr0 = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_CCR1:
- vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.ccr1 = kvmppc_get_gpr(vcpu, rs); break;
default:
emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs);
}
@@ -167,13 +168,13 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
switch (sprn) {
case SPRN_PID:
- vcpu->arch.gpr[rt] = vcpu->arch.pid; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.pid); break;
case SPRN_MMUCR:
- vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucr); break;
case SPRN_CCR0:
- vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr0); break;
case SPRN_CCR1:
- vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr1); break;
default:
emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt);
}
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index ff3cb63b811..2570fcc7665 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -439,7 +439,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
struct kvmppc_44x_tlbe *tlbe;
unsigned int gtlb_index;
- gtlb_index = vcpu->arch.gpr[ra];
+ gtlb_index = kvmppc_get_gpr(vcpu, ra);
if (gtlb_index > KVM44x_GUEST_TLB_SIZE) {
printk("%s: index %d\n", __func__, gtlb_index);
kvmppc_dump_vcpu(vcpu);
@@ -455,15 +455,15 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
switch (ws) {
case PPC44x_TLB_PAGEID:
tlbe->tid = get_mmucr_stid(vcpu);
- tlbe->word0 = vcpu->arch.gpr[rs];
+ tlbe->word0 = kvmppc_get_gpr(vcpu, rs);
break;
case PPC44x_TLB_XLAT:
- tlbe->word1 = vcpu->arch.gpr[rs];
+ tlbe->word1 = kvmppc_get_gpr(vcpu, rs);
break;
case PPC44x_TLB_ATTRIB:
- tlbe->word2 = vcpu->arch.gpr[rs];
+ tlbe->word2 = kvmppc_get_gpr(vcpu, rs);
break;
default:
@@ -500,18 +500,20 @@ int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc)
unsigned int as = get_mmucr_sts(vcpu);
unsigned int pid = get_mmucr_stid(vcpu);
- ea = vcpu->arch.gpr[rb];
+ ea = kvmppc_get_gpr(vcpu, rb);
if (ra)
- ea += vcpu->arch.gpr[ra];
+ ea += kvmppc_get_gpr(vcpu, ra);
gtlb_index = kvmppc_44x_tlb_index(vcpu, ea, pid, as);
if (rc) {
+ u32 cr = kvmppc_get_cr(vcpu);
+
if (gtlb_index < 0)
- vcpu->arch.cr &= ~0x20000000;
+ kvmppc_set_cr(vcpu, cr & ~0x20000000);
else
- vcpu->arch.cr |= 0x20000000;
+ kvmppc_set_cr(vcpu, cr | 0x20000000);
}
- vcpu->arch.gpr[rt] = gtlb_index;
+ kvmppc_set_gpr(vcpu, rt, gtlb_index);
kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS);
return EMULATE_DONE;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index fe037fdaf1b..60624cc9f4d 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,6 +20,7 @@ config KVM
bool
select PREEMPT_NOTIFIERS
select ANON_INODES
+ select KVM_MMIO
config KVM_BOOK3S_64_HANDLER
bool
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 3e294bd9b8c..9a271f0929c 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -33,12 +33,9 @@
/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
+/* #define DEBUG_EXT */
-/* Without AGGRESSIVE_DEC we only fire off a DEC interrupt when DEC turns 0.
- * When set, we retrigger a DEC interrupt after that if DEC <= 0.
- * PPC32 Linux runs faster without AGGRESSIVE_DEC, PPC64 Linux requires it. */
-
-/* #define AGGRESSIVE_DEC */
+static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "exits", VCPU_STAT(sum_exits) },
@@ -72,16 +69,24 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
memcpy(get_paca()->kvm_slb, to_book3s(vcpu)->slb_shadow, sizeof(get_paca()->kvm_slb));
+ memcpy(&get_paca()->shadow_vcpu, &to_book3s(vcpu)->shadow_vcpu,
+ sizeof(get_paca()->shadow_vcpu));
get_paca()->kvm_slb_max = to_book3s(vcpu)->slb_shadow_max;
}
void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
{
memcpy(to_book3s(vcpu)->slb_shadow, get_paca()->kvm_slb, sizeof(get_paca()->kvm_slb));
+ memcpy(&to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
+ sizeof(get_paca()->shadow_vcpu));
to_book3s(vcpu)->slb_shadow_max = get_paca()->kvm_slb_max;
+
+ kvmppc_giveup_ext(vcpu, MSR_FP);
+ kvmppc_giveup_ext(vcpu, MSR_VEC);
+ kvmppc_giveup_ext(vcpu, MSR_VSX);
}
-#if defined(AGGRESSIVE_DEC) || defined(EXIT_DEBUG)
+#if defined(EXIT_DEBUG)
static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu)
{
u64 jd = mftb() - vcpu->arch.dec_jiffies;
@@ -89,6 +94,23 @@ static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu)
}
#endif
+static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.shadow_msr = vcpu->arch.msr;
+ /* Guest MSR values */
+ vcpu->arch.shadow_msr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE |
+ MSR_BE | MSR_DE;
+ /* Process MSR values */
+ vcpu->arch.shadow_msr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR |
+ MSR_EE;
+ /* External providers the guest reserved */
+ vcpu->arch.shadow_msr |= (vcpu->arch.msr & vcpu->arch.guest_owned_ext);
+ /* 64-bit Process MSR values */
+#ifdef CONFIG_PPC_BOOK3S_64
+ vcpu->arch.shadow_msr |= MSR_ISF | MSR_HV;
+#endif
+}
+
void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
{
ulong old_msr = vcpu->arch.msr;
@@ -96,12 +118,10 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
#ifdef EXIT_DEBUG
printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
#endif
+
msr &= to_book3s(vcpu)->msr_mask;
vcpu->arch.msr = msr;
- vcpu->arch.shadow_msr = msr | MSR_USER32;
- vcpu->arch.shadow_msr &= ( MSR_VEC | MSR_VSX | MSR_FP | MSR_FE0 |
- MSR_USER64 | MSR_SE | MSR_BE | MSR_DE |
- MSR_FE1);
+ kvmppc_recalc_shadow_msr(vcpu);
if (msr & (MSR_WE|MSR_POW)) {
if (!vcpu->arch.pending_exceptions) {
@@ -125,11 +145,10 @@ void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
vcpu->arch.mmu.reset_msr(vcpu);
}
-void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
+static int kvmppc_book3s_vec2irqprio(unsigned int vec)
{
unsigned int prio;
- vcpu->stat.queue_intr++;
switch (vec) {
case 0x100: prio = BOOK3S_IRQPRIO_SYSTEM_RESET; break;
case 0x200: prio = BOOK3S_IRQPRIO_MACHINE_CHECK; break;
@@ -149,15 +168,31 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
default: prio = BOOK3S_IRQPRIO_MAX; break;
}
- set_bit(prio, &vcpu->arch.pending_exceptions);
+ return prio;
+}
+
+static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+ unsigned int vec)
+{
+ clear_bit(kvmppc_book3s_vec2irqprio(vec),
+ &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
+{
+ vcpu->stat.queue_intr++;
+
+ set_bit(kvmppc_book3s_vec2irqprio(vec),
+ &vcpu->arch.pending_exceptions);
#ifdef EXIT_DEBUG
printk(KERN_INFO "Queueing interrupt %x\n", vec);
#endif
}
-void kvmppc_core_queue_program(struct kvm_vcpu *vcpu)
+void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
{
+ to_book3s(vcpu)->prog_flags = flags;
kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM);
}
@@ -171,6 +206,11 @@ int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
return test_bit(BOOK3S_INTERRUPT_DECREMENTER >> 7, &vcpu->arch.pending_exceptions);
}
+void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
+{
+ kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
+}
+
void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
@@ -181,6 +221,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
{
int deliver = 1;
int vec = 0;
+ ulong flags = 0ULL;
switch (priority) {
case BOOK3S_IRQPRIO_DECREMENTER:
@@ -214,6 +255,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
break;
case BOOK3S_IRQPRIO_PROGRAM:
vec = BOOK3S_INTERRUPT_PROGRAM;
+ flags = to_book3s(vcpu)->prog_flags;
break;
case BOOK3S_IRQPRIO_VSX:
vec = BOOK3S_INTERRUPT_VSX;
@@ -244,7 +286,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
#endif
if (deliver)
- kvmppc_inject_interrupt(vcpu, vec, 0ULL);
+ kvmppc_inject_interrupt(vcpu, vec, flags);
return deliver;
}
@@ -254,21 +296,15 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
unsigned long *pending = &vcpu->arch.pending_exceptions;
unsigned int priority;
- /* XXX be more clever here - no need to mftb() on every entry */
- /* Issue DEC again if it's still active */
-#ifdef AGGRESSIVE_DEC
- if (vcpu->arch.msr & MSR_EE)
- if (kvmppc_get_dec(vcpu) & 0x80000000)
- kvmppc_core_queue_dec(vcpu);
-#endif
-
#ifdef EXIT_DEBUG
if (vcpu->arch.pending_exceptions)
printk(KERN_EMERG "KVM: Check pending: %lx\n", vcpu->arch.pending_exceptions);
#endif
priority = __ffs(*pending);
while (priority <= (sizeof(unsigned int) * 8)) {
- if (kvmppc_book3s_irqprio_deliver(vcpu, priority)) {
+ if (kvmppc_book3s_irqprio_deliver(vcpu, priority) &&
+ (priority != BOOK3S_IRQPRIO_DECREMENTER)) {
+ /* DEC interrupts get cleared by mtdec */
clear_bit(priority, &vcpu->arch.pending_exceptions);
break;
}
@@ -503,14 +539,14 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* Page not found in guest PTE entries */
vcpu->arch.dear = vcpu->arch.fault_dear;
to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr;
- vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL);
+ vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL);
kvmppc_book3s_queue_irqprio(vcpu, vec);
} else if (page_found == -EPERM) {
/* Storage protection */
vcpu->arch.dear = vcpu->arch.fault_dear;
to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE;
to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT;
- vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL);
+ vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL);
kvmppc_book3s_queue_irqprio(vcpu, vec);
} else if (page_found == -EINVAL) {
/* Page not found in guest SLB */
@@ -532,13 +568,122 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = kvmppc_emulate_mmio(run, vcpu);
if ( r == RESUME_HOST_NV )
r = RESUME_HOST;
- if ( r == RESUME_GUEST_NV )
- r = RESUME_GUEST;
}
return r;
}
+static inline int get_fpr_index(int i)
+{
+#ifdef CONFIG_VSX
+ i *= 2;
+#endif
+ return i;
+}
+
+/* Give up external provider (FPU, Altivec, VSX) */
+static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
+{
+ struct thread_struct *t = &current->thread;
+ u64 *vcpu_fpr = vcpu->arch.fpr;
+ u64 *vcpu_vsx = vcpu->arch.vsr;
+ u64 *thread_fpr = (u64*)t->fpr;
+ int i;
+
+ if (!(vcpu->arch.guest_owned_ext & msr))
+ return;
+
+#ifdef DEBUG_EXT
+ printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
+#endif
+
+ switch (msr) {
+ case MSR_FP:
+ giveup_fpu(current);
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
+ vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
+
+ vcpu->arch.fpscr = t->fpscr.val;
+ break;
+ case MSR_VEC:
+#ifdef CONFIG_ALTIVEC
+ giveup_altivec(current);
+ memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
+ vcpu->arch.vscr = t->vscr;
+#endif
+ break;
+ case MSR_VSX:
+#ifdef CONFIG_VSX
+ __giveup_vsx(current);
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
+ vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
+#endif
+ break;
+ default:
+ BUG();
+ }
+
+ vcpu->arch.guest_owned_ext &= ~msr;
+ current->thread.regs->msr &= ~msr;
+ kvmppc_recalc_shadow_msr(vcpu);
+}
+
+/* Handle external providers (FPU, Altivec, VSX) */
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+ ulong msr)
+{
+ struct thread_struct *t = &current->thread;
+ u64 *vcpu_fpr = vcpu->arch.fpr;
+ u64 *vcpu_vsx = vcpu->arch.vsr;
+ u64 *thread_fpr = (u64*)t->fpr;
+ int i;
+
+ if (!(vcpu->arch.msr & msr)) {
+ kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+ return RESUME_GUEST;
+ }
+
+#ifdef DEBUG_EXT
+ printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
+#endif
+
+ current->thread.regs->msr |= msr;
+
+ switch (msr) {
+ case MSR_FP:
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
+ thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
+
+ t->fpscr.val = vcpu->arch.fpscr;
+ t->fpexc_mode = 0;
+ kvmppc_load_up_fpu();
+ break;
+ case MSR_VEC:
+#ifdef CONFIG_ALTIVEC
+ memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
+ t->vscr = vcpu->arch.vscr;
+ t->vrsave = -1;
+ kvmppc_load_up_altivec();
+#endif
+ break;
+ case MSR_VSX:
+#ifdef CONFIG_VSX
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
+ thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
+ kvmppc_load_up_vsx();
+#endif
+ break;
+ default:
+ BUG();
+ }
+
+ vcpu->arch.guest_owned_ext |= msr;
+
+ kvmppc_recalc_shadow_msr(vcpu);
+
+ return RESUME_GUEST;
+}
+
int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int exit_nr)
{
@@ -563,7 +708,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
case BOOK3S_INTERRUPT_INST_STORAGE:
vcpu->stat.pf_instruc++;
/* only care about PTEG not found errors, but leave NX alone */
- if (vcpu->arch.shadow_msr & 0x40000000) {
+ if (vcpu->arch.shadow_srr1 & 0x40000000) {
r = kvmppc_handle_pagefault(run, vcpu, vcpu->arch.pc, exit_nr);
vcpu->stat.sp_instruc++;
} else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
@@ -575,7 +720,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
*/
kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL);
} else {
- vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x58000000);
+ vcpu->arch.msr |= vcpu->arch.shadow_srr1 & 0x58000000;
kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL);
r = RESUME_GUEST;
@@ -621,6 +766,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
case BOOK3S_INTERRUPT_PROGRAM:
{
enum emulation_result er;
+ ulong flags;
+
+ flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
if (vcpu->arch.msr & MSR_PR) {
#ifdef EXIT_DEBUG
@@ -628,7 +776,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
#endif
if ((vcpu->arch.last_inst & 0xff0007ff) !=
(INS_DCBZ & 0xfffffff7)) {
- kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+ kvmppc_core_queue_program(vcpu, flags);
r = RESUME_GUEST;
break;
}
@@ -638,12 +786,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
er = kvmppc_emulate_instruction(run, vcpu);
switch (er) {
case EMULATE_DONE:
- r = RESUME_GUEST;
+ r = RESUME_GUEST_NV;
break;
case EMULATE_FAIL:
printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
__func__, vcpu->arch.pc, vcpu->arch.last_inst);
- kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+ kvmppc_core_queue_program(vcpu, flags);
r = RESUME_GUEST;
break;
default:
@@ -653,23 +801,30 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
case BOOK3S_INTERRUPT_SYSCALL:
#ifdef EXIT_DEBUG
- printk(KERN_INFO "Syscall Nr %d\n", (int)vcpu->arch.gpr[0]);
+ printk(KERN_INFO "Syscall Nr %d\n", (int)kvmppc_get_gpr(vcpu, 0));
#endif
vcpu->stat.syscall_exits++;
kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
r = RESUME_GUEST;
break;
- case BOOK3S_INTERRUPT_MACHINE_CHECK:
case BOOK3S_INTERRUPT_FP_UNAVAIL:
- case BOOK3S_INTERRUPT_TRACE:
+ r = kvmppc_handle_ext(vcpu, exit_nr, MSR_FP);
+ break;
case BOOK3S_INTERRUPT_ALTIVEC:
+ r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VEC);
+ break;
case BOOK3S_INTERRUPT_VSX:
+ r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VSX);
+ break;
+ case BOOK3S_INTERRUPT_MACHINE_CHECK:
+ case BOOK3S_INTERRUPT_TRACE:
kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
r = RESUME_GUEST;
break;
default:
/* Ugh - bork here! What did we get? */
- printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", exit_nr, vcpu->arch.pc, vcpu->arch.shadow_msr);
+ printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
+ exit_nr, vcpu->arch.pc, vcpu->arch.shadow_srr1);
r = RESUME_HOST;
BUG();
break;
@@ -712,10 +867,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int i;
regs->pc = vcpu->arch.pc;
- regs->cr = vcpu->arch.cr;
+ regs->cr = kvmppc_get_cr(vcpu);
regs->ctr = vcpu->arch.ctr;
regs->lr = vcpu->arch.lr;
- regs->xer = vcpu->arch.xer;
+ regs->xer = kvmppc_get_xer(vcpu);
regs->msr = vcpu->arch.msr;
regs->srr0 = vcpu->arch.srr0;
regs->srr1 = vcpu->arch.srr1;
@@ -729,7 +884,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->sprg7 = vcpu->arch.sprg6;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
- regs->gpr[i] = vcpu->arch.gpr[i];
+ regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
return 0;
}
@@ -739,10 +894,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int i;
vcpu->arch.pc = regs->pc;
- vcpu->arch.cr = regs->cr;
+ kvmppc_set_cr(vcpu, regs->cr);
vcpu->arch.ctr = regs->ctr;
vcpu->arch.lr = regs->lr;
- vcpu->arch.xer = regs->xer;
+ kvmppc_set_xer(vcpu, regs->xer);
kvmppc_set_msr(vcpu, regs->msr);
vcpu->arch.srr0 = regs->srr0;
vcpu->arch.srr1 = regs->srr1;
@@ -754,8 +909,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.sprg6 = regs->sprg5;
vcpu->arch.sprg7 = regs->sprg6;
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++)
- vcpu->arch.gpr[i] = regs->gpr[i];
+ for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+ kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
return 0;
}
@@ -850,7 +1005,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
int is_dirty = 0;
int r, n;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
r = kvm_get_dirty_log(kvm, log, &is_dirty);
if (r)
@@ -858,7 +1013,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
- memslot = &kvm->memslots[log->slot];
+ memslot = &kvm->memslots->memslots[log->slot];
ga = memslot->base_gfn << PAGE_SHIFT;
ga_end = ga + (memslot->npages << PAGE_SHIFT);
@@ -872,7 +1027,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
r = 0;
out:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -910,6 +1065,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
+ vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
vcpu->arch.shadow_msr = MSR_USER64;
@@ -943,6 +1099,10 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
int ret;
+ struct thread_struct ext_bkp;
+ bool save_vec = current->thread.used_vr;
+ bool save_vsx = current->thread.used_vsr;
+ ulong ext_msr;
/* No need to go into the guest when all we do is going out */
if (signal_pending(current)) {
@@ -950,6 +1110,35 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return -EINTR;
}
+ /* Save FPU state in stack */
+ if (current->thread.regs->msr & MSR_FP)
+ giveup_fpu(current);
+ memcpy(ext_bkp.fpr, current->thread.fpr, sizeof(current->thread.fpr));
+ ext_bkp.fpscr = current->thread.fpscr;
+ ext_bkp.fpexc_mode = current->thread.fpexc_mode;
+
+#ifdef CONFIG_ALTIVEC
+ /* Save Altivec state in stack */
+ if (save_vec) {
+ if (current->thread.regs->msr & MSR_VEC)
+ giveup_altivec(current);
+ memcpy(ext_bkp.vr, current->thread.vr, sizeof(ext_bkp.vr));
+ ext_bkp.vscr = current->thread.vscr;
+ ext_bkp.vrsave = current->thread.vrsave;
+ }
+ ext_bkp.used_vr = current->thread.used_vr;
+#endif
+
+#ifdef CONFIG_VSX
+ /* Save VSX state in stack */
+ if (save_vsx && (current->thread.regs->msr & MSR_VSX))
+ __giveup_vsx(current);
+ ext_bkp.used_vsr = current->thread.used_vsr;
+#endif
+
+ /* Remember the MSR with disabled extensions */
+ ext_msr = current->thread.regs->msr;
+
/* XXX we get called with irq disabled - change that! */
local_irq_enable();
@@ -957,6 +1146,32 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
local_irq_disable();
+ current->thread.regs->msr = ext_msr;
+
+ /* Make sure we save the guest FPU/Altivec/VSX state */
+ kvmppc_giveup_ext(vcpu, MSR_FP);
+ kvmppc_giveup_ext(vcpu, MSR_VEC);
+ kvmppc_giveup_ext(vcpu, MSR_VSX);
+
+ /* Restore FPU state from stack */
+ memcpy(current->thread.fpr, ext_bkp.fpr, sizeof(ext_bkp.fpr));
+ current->thread.fpscr = ext_bkp.fpscr;
+ current->thread.fpexc_mode = ext_bkp.fpexc_mode;
+
+#ifdef CONFIG_ALTIVEC
+ /* Restore Altivec state from stack */
+ if (save_vec && current->thread.used_vr) {
+ memcpy(current->thread.vr, ext_bkp.vr, sizeof(ext_bkp.vr));
+ current->thread.vscr = ext_bkp.vscr;
+ current->thread.vrsave= ext_bkp.vrsave;
+ }
+ current->thread.used_vr = ext_bkp.used_vr;
+#endif
+
+#ifdef CONFIG_VSX
+ current->thread.used_vsr = ext_bkp.used_vsr;
+#endif
+
return ret;
}
diff --git a/arch/powerpc/kvm/book3s_64_emulate.c b/arch/powerpc/kvm/book3s_64_emulate.c
index 1027eac6d47..2b0ee7e040c 100644
--- a/arch/powerpc/kvm/book3s_64_emulate.c
+++ b/arch/powerpc/kvm/book3s_64_emulate.c
@@ -65,11 +65,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
case 31:
switch (get_xop(inst)) {
case OP_31_XOP_MFMSR:
- vcpu->arch.gpr[get_rt(inst)] = vcpu->arch.msr;
+ kvmppc_set_gpr(vcpu, get_rt(inst), vcpu->arch.msr);
break;
case OP_31_XOP_MTMSRD:
{
- ulong rs = vcpu->arch.gpr[get_rs(inst)];
+ ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst));
if (inst & 0x10000) {
vcpu->arch.msr &= ~(MSR_RI | MSR_EE);
vcpu->arch.msr |= rs & (MSR_RI | MSR_EE);
@@ -78,30 +78,30 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
}
case OP_31_XOP_MTMSR:
- kvmppc_set_msr(vcpu, vcpu->arch.gpr[get_rs(inst)]);
+ kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, get_rs(inst)));
break;
case OP_31_XOP_MFSRIN:
{
int srnum;
- srnum = (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf;
+ srnum = (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf;
if (vcpu->arch.mmu.mfsrin) {
u32 sr;
sr = vcpu->arch.mmu.mfsrin(vcpu, srnum);
- vcpu->arch.gpr[get_rt(inst)] = sr;
+ kvmppc_set_gpr(vcpu, get_rt(inst), sr);
}
break;
}
case OP_31_XOP_MTSRIN:
vcpu->arch.mmu.mtsrin(vcpu,
- (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf,
- vcpu->arch.gpr[get_rs(inst)]);
+ (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf,
+ kvmppc_get_gpr(vcpu, get_rs(inst)));
break;
case OP_31_XOP_TLBIE:
case OP_31_XOP_TLBIEL:
{
bool large = (inst & 0x00200000) ? true : false;
- ulong addr = vcpu->arch.gpr[get_rb(inst)];
+ ulong addr = kvmppc_get_gpr(vcpu, get_rb(inst));
vcpu->arch.mmu.tlbie(vcpu, addr, large);
break;
}
@@ -111,14 +111,16 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
if (!vcpu->arch.mmu.slbmte)
return EMULATE_FAIL;
- vcpu->arch.mmu.slbmte(vcpu, vcpu->arch.gpr[get_rs(inst)],
- vcpu->arch.gpr[get_rb(inst)]);
+ vcpu->arch.mmu.slbmte(vcpu,
+ kvmppc_get_gpr(vcpu, get_rs(inst)),
+ kvmppc_get_gpr(vcpu, get_rb(inst)));
break;
case OP_31_XOP_SLBIE:
if (!vcpu->arch.mmu.slbie)
return EMULATE_FAIL;
- vcpu->arch.mmu.slbie(vcpu, vcpu->arch.gpr[get_rb(inst)]);
+ vcpu->arch.mmu.slbie(vcpu,
+ kvmppc_get_gpr(vcpu, get_rb(inst)));
break;
case OP_31_XOP_SLBIA:
if (!vcpu->arch.mmu.slbia)
@@ -132,9 +134,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
} else {
ulong t, rb;
- rb = vcpu->arch.gpr[get_rb(inst)];
+ rb = kvmppc_get_gpr(vcpu, get_rb(inst));
t = vcpu->arch.mmu.slbmfee(vcpu, rb);
- vcpu->arch.gpr[get_rt(inst)] = t;
+ kvmppc_set_gpr(vcpu, get_rt(inst), t);
}
break;
case OP_31_XOP_SLBMFEV:
@@ -143,20 +145,20 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
} else {
ulong t, rb;
- rb = vcpu->arch.gpr[get_rb(inst)];
+ rb = kvmppc_get_gpr(vcpu, get_rb(inst));
t = vcpu->arch.mmu.slbmfev(vcpu, rb);
- vcpu->arch.gpr[get_rt(inst)] = t;
+ kvmppc_set_gpr(vcpu, get_rt(inst), t);
}
break;
case OP_31_XOP_DCBZ:
{
- ulong rb = vcpu->arch.gpr[get_rb(inst)];
+ ulong rb = kvmppc_get_gpr(vcpu, get_rb(inst));
ulong ra = 0;
ulong addr;
u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
if (get_ra(inst))
- ra = vcpu->arch.gpr[get_ra(inst)];
+ ra = kvmppc_get_gpr(vcpu, get_ra(inst));
addr = (ra + rb) & ~31ULL;
if (!(vcpu->arch.msr & MSR_SF))
@@ -233,43 +235,44 @@ static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val)
int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
{
int emulated = EMULATE_DONE;
+ ulong spr_val = kvmppc_get_gpr(vcpu, rs);
switch (sprn) {
case SPRN_SDR1:
- to_book3s(vcpu)->sdr1 = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->sdr1 = spr_val;
break;
case SPRN_DSISR:
- to_book3s(vcpu)->dsisr = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->dsisr = spr_val;
break;
case SPRN_DAR:
- vcpu->arch.dear = vcpu->arch.gpr[rs];
+ vcpu->arch.dear = spr_val;
break;
case SPRN_HIOR:
- to_book3s(vcpu)->hior = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hior = spr_val;
break;
case SPRN_IBAT0U ... SPRN_IBAT3L:
case SPRN_IBAT4U ... SPRN_IBAT7L:
case SPRN_DBAT0U ... SPRN_DBAT3L:
case SPRN_DBAT4U ... SPRN_DBAT7L:
- kvmppc_write_bat(vcpu, sprn, (u32)vcpu->arch.gpr[rs]);
+ kvmppc_write_bat(vcpu, sprn, (u32)spr_val);
/* BAT writes happen so rarely that we're ok to flush
* everything here */
kvmppc_mmu_pte_flush(vcpu, 0, 0);
break;
case SPRN_HID0:
- to_book3s(vcpu)->hid[0] = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hid[0] = spr_val;
break;
case SPRN_HID1:
- to_book3s(vcpu)->hid[1] = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hid[1] = spr_val;
break;
case SPRN_HID2:
- to_book3s(vcpu)->hid[2] = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hid[2] = spr_val;
break;
case SPRN_HID4:
- to_book3s(vcpu)->hid[4] = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hid[4] = spr_val;
break;
case SPRN_HID5:
- to_book3s(vcpu)->hid[5] = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hid[5] = spr_val;
/* guest HID5 set can change is_dcbz32 */
if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
(mfmsr() & MSR_HV))
@@ -299,38 +302,38 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
switch (sprn) {
case SPRN_SDR1:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->sdr1;
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
break;
case SPRN_DSISR:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->dsisr;
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->dsisr);
break;
case SPRN_DAR:
- vcpu->arch.gpr[rt] = vcpu->arch.dear;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear);
break;
case SPRN_HIOR:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hior;
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior);
break;
case SPRN_HID0:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[0];
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[0]);
break;
case SPRN_HID1:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[1];
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[1]);
break;
case SPRN_HID2:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[2];
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[2]);
break;
case SPRN_HID4:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[4];
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[4]);
break;
case SPRN_HID5:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[5];
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]);
break;
case SPRN_THRM1:
case SPRN_THRM2:
case SPRN_THRM3:
case SPRN_CTRLF:
case SPRN_CTRLT:
- vcpu->arch.gpr[rt] = 0;
+ kvmppc_set_gpr(vcpu, rt, 0);
break;
default:
printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn);
diff --git a/arch/powerpc/kvm/book3s_64_exports.c b/arch/powerpc/kvm/book3s_64_exports.c
index 5b2db38ed86..1dd5a1ddfd0 100644
--- a/arch/powerpc/kvm/book3s_64_exports.c
+++ b/arch/powerpc/kvm/book3s_64_exports.c
@@ -22,3 +22,11 @@
EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter);
EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem);
+EXPORT_SYMBOL_GPL(kvmppc_rmcall);
+EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
+#ifdef CONFIG_ALTIVEC
+EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
+#endif
+#ifdef CONFIG_VSX
+EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx);
+#endif
diff --git a/arch/powerpc/kvm/book3s_64_interrupts.S b/arch/powerpc/kvm/book3s_64_interrupts.S
index 7b55d8094c8..c1584d0cbce 100644
--- a/arch/powerpc/kvm/book3s_64_interrupts.S
+++ b/arch/powerpc/kvm/book3s_64_interrupts.S
@@ -28,11 +28,6 @@
#define ULONG_SIZE 8
#define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE))
-.macro mfpaca tmp_reg, src_reg, offset, vcpu_reg
- ld \tmp_reg, (PACA_EXMC+\offset)(r13)
- std \tmp_reg, VCPU_GPR(\src_reg)(\vcpu_reg)
-.endm
-
.macro DISABLE_INTERRUPTS
mfmsr r0
rldicl r0,r0,48,1
@@ -40,6 +35,26 @@
mtmsrd r0,1
.endm
+#define VCPU_LOAD_NVGPRS(vcpu) \
+ ld r14, VCPU_GPR(r14)(vcpu); \
+ ld r15, VCPU_GPR(r15)(vcpu); \
+ ld r16, VCPU_GPR(r16)(vcpu); \
+ ld r17, VCPU_GPR(r17)(vcpu); \
+ ld r18, VCPU_GPR(r18)(vcpu); \
+ ld r19, VCPU_GPR(r19)(vcpu); \
+ ld r20, VCPU_GPR(r20)(vcpu); \
+ ld r21, VCPU_GPR(r21)(vcpu); \
+ ld r22, VCPU_GPR(r22)(vcpu); \
+ ld r23, VCPU_GPR(r23)(vcpu); \
+ ld r24, VCPU_GPR(r24)(vcpu); \
+ ld r25, VCPU_GPR(r25)(vcpu); \
+ ld r26, VCPU_GPR(r26)(vcpu); \
+ ld r27, VCPU_GPR(r27)(vcpu); \
+ ld r28, VCPU_GPR(r28)(vcpu); \
+ ld r29, VCPU_GPR(r29)(vcpu); \
+ ld r30, VCPU_GPR(r30)(vcpu); \
+ ld r31, VCPU_GPR(r31)(vcpu); \
+
/*****************************************************************************
* *
* Guest entry / exit code that is in kernel module memory (highmem) *
@@ -67,61 +82,32 @@ kvm_start_entry:
SAVE_NVGPRS(r1)
/* Save LR */
- mflr r14
- std r14, _LINK(r1)
-
-/* XXX optimize non-volatile loading away */
-kvm_start_lightweight:
+ std r0, _LINK(r1)
- DISABLE_INTERRUPTS
+ /* Load non-volatile guest state from the vcpu */
+ VCPU_LOAD_NVGPRS(r4)
/* Save R1/R2 in the PACA */
- std r1, PACAR1(r13)
- std r2, (PACA_EXMC+EX_SRR0)(r13)
+ std r1, PACA_KVM_HOST_R1(r13)
+ std r2, PACA_KVM_HOST_R2(r13)
+
+ /* XXX swap in/out on load? */
ld r3, VCPU_HIGHMEM_HANDLER(r4)
- std r3, PACASAVEDMSR(r13)
+ std r3, PACA_KVM_VMHANDLER(r13)
- /* Load non-volatile guest state from the vcpu */
- ld r14, VCPU_GPR(r14)(r4)
- ld r15, VCPU_GPR(r15)(r4)
- ld r16, VCPU_GPR(r16)(r4)
- ld r17, VCPU_GPR(r17)(r4)
- ld r18, VCPU_GPR(r18)(r4)
- ld r19, VCPU_GPR(r19)(r4)
- ld r20, VCPU_GPR(r20)(r4)
- ld r21, VCPU_GPR(r21)(r4)
- ld r22, VCPU_GPR(r22)(r4)
- ld r23, VCPU_GPR(r23)(r4)
- ld r24, VCPU_GPR(r24)(r4)
- ld r25, VCPU_GPR(r25)(r4)
- ld r26, VCPU_GPR(r26)(r4)
- ld r27, VCPU_GPR(r27)(r4)
- ld r28, VCPU_GPR(r28)(r4)
- ld r29, VCPU_GPR(r29)(r4)
- ld r30, VCPU_GPR(r30)(r4)
- ld r31, VCPU_GPR(r31)(r4)
+kvm_start_lightweight:
ld r9, VCPU_PC(r4) /* r9 = vcpu->arch.pc */
ld r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */
- ld r3, VCPU_TRAMPOLINE_ENTER(r4)
- mtsrr0 r3
-
- LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR))
- mtsrr1 r3
-
- /* Load guest state in the respective registers */
- lwz r3, VCPU_CR(r4) /* r3 = vcpu->arch.cr */
- stw r3, (PACA_EXMC + EX_CCR)(r13)
-
- ld r3, VCPU_CTR(r4) /* r3 = vcpu->arch.ctr */
- mtctr r3 /* CTR = r3 */
+ /* Load some guest state in the respective registers */
+ ld r5, VCPU_CTR(r4) /* r5 = vcpu->arch.ctr */
+ /* will be swapped in by rmcall */
ld r3, VCPU_LR(r4) /* r3 = vcpu->arch.lr */
mtlr r3 /* LR = r3 */
- ld r3, VCPU_XER(r4) /* r3 = vcpu->arch.xer */
- std r3, (PACA_EXMC + EX_R3)(r13)
+ DISABLE_INTERRUPTS
/* Some guests may need to have dcbz set to 32 byte length.
*
@@ -141,36 +127,15 @@ kvm_start_lightweight:
mtspr SPRN_HID5,r3
no_dcbz32_on:
- /* Load guest GPRs */
-
- ld r3, VCPU_GPR(r9)(r4)
- std r3, (PACA_EXMC + EX_R9)(r13)
- ld r3, VCPU_GPR(r10)(r4)
- std r3, (PACA_EXMC + EX_R10)(r13)
- ld r3, VCPU_GPR(r11)(r4)
- std r3, (PACA_EXMC + EX_R11)(r13)
- ld r3, VCPU_GPR(r12)(r4)
- std r3, (PACA_EXMC + EX_R12)(r13)
- ld r3, VCPU_GPR(r13)(r4)
- std r3, (PACA_EXMC + EX_R13)(r13)
-
- ld r0, VCPU_GPR(r0)(r4)
- ld r1, VCPU_GPR(r1)(r4)
- ld r2, VCPU_GPR(r2)(r4)
- ld r3, VCPU_GPR(r3)(r4)
- ld r5, VCPU_GPR(r5)(r4)
- ld r6, VCPU_GPR(r6)(r4)
- ld r7, VCPU_GPR(r7)(r4)
- ld r8, VCPU_GPR(r8)(r4)
- ld r4, VCPU_GPR(r4)(r4)
-
- /* This sets the Magic value for the trampoline */
-
- li r11, 1
- stb r11, PACA_KVM_IN_GUEST(r13)
+
+ ld r6, VCPU_RMCALL(r4)
+ mtctr r6
+
+ ld r3, VCPU_TRAMPOLINE_ENTER(r4)
+ LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR))
/* Jump to SLB patching handlder and into our guest */
- RFI
+ bctr
/*
* This is the handler in module memory. It gets jumped at from the
@@ -184,125 +149,70 @@ kvmppc_handler_highmem:
/*
* Register usage at this point:
*
- * R00 = guest R13
- * R01 = host R1
- * R02 = host R2
- * R10 = guest PC
- * R11 = guest MSR
- * R12 = exit handler id
- * R13 = PACA
- * PACA.exmc.R9 = guest R1
- * PACA.exmc.R10 = guest R10
- * PACA.exmc.R11 = guest R11
- * PACA.exmc.R12 = guest R12
- * PACA.exmc.R13 = guest R2
- * PACA.exmc.DAR = guest DAR
- * PACA.exmc.DSISR = guest DSISR
- * PACA.exmc.LR = guest instruction
- * PACA.exmc.CCR = guest CR
- * PACA.exmc.SRR0 = guest R0
+ * R0 = guest last inst
+ * R1 = host R1
+ * R2 = host R2
+ * R3 = guest PC
+ * R4 = guest MSR
+ * R5 = guest DAR
+ * R6 = guest DSISR
+ * R13 = PACA
+ * PACA.KVM.* = guest *
*
*/
- std r3, (PACA_EXMC+EX_R3)(r13)
+ /* R7 = vcpu */
+ ld r7, GPR4(r1)
- /* save the exit id in R3 */
- mr r3, r12
+ /* Now save the guest state */
- /* R12 = vcpu */
- ld r12, GPR4(r1)
+ stw r0, VCPU_LAST_INST(r7)
- /* Now save the guest state */
+ std r3, VCPU_PC(r7)
+ std r4, VCPU_SHADOW_SRR1(r7)
+ std r5, VCPU_FAULT_DEAR(r7)
+ std r6, VCPU_FAULT_DSISR(r7)
- std r0, VCPU_GPR(r13)(r12)
- std r4, VCPU_GPR(r4)(r12)
- std r5, VCPU_GPR(r5)(r12)
- std r6, VCPU_GPR(r6)(r12)
- std r7, VCPU_GPR(r7)(r12)
- std r8, VCPU_GPR(r8)(r12)
- std r9, VCPU_GPR(r9)(r12)
-
- /* get registers from PACA */
- mfpaca r5, r0, EX_SRR0, r12
- mfpaca r5, r3, EX_R3, r12
- mfpaca r5, r1, EX_R9, r12
- mfpaca r5, r10, EX_R10, r12
- mfpaca r5, r11, EX_R11, r12
- mfpaca r5, r12, EX_R12, r12
- mfpaca r5, r2, EX_R13, r12
-
- lwz r5, (PACA_EXMC+EX_LR)(r13)
- stw r5, VCPU_LAST_INST(r12)
-
- lwz r5, (PACA_EXMC+EX_CCR)(r13)
- stw r5, VCPU_CR(r12)
-
- ld r5, VCPU_HFLAGS(r12)
+ ld r5, VCPU_HFLAGS(r7)
rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */
beq no_dcbz32_off
+ li r4, 0
mfspr r5,SPRN_HID5
- rldimi r5,r5,6,56
+ rldimi r5,r4,6,56
mtspr SPRN_HID5,r5
no_dcbz32_off:
- /* XXX maybe skip on lightweight? */
- std r14, VCPU_GPR(r14)(r12)
- std r15, VCPU_GPR(r15)(r12)
- std r16, VCPU_GPR(r16)(r12)
- std r17, VCPU_GPR(r17)(r12)
- std r18, VCPU_GPR(r18)(r12)
- std r19, VCPU_GPR(r19)(r12)
- std r20, VCPU_GPR(r20)(r12)
- std r21, VCPU_GPR(r21)(r12)
- std r22, VCPU_GPR(r22)(r12)
- std r23, VCPU_GPR(r23)(r12)
- std r24, VCPU_GPR(r24)(r12)
- std r25, VCPU_GPR(r25)(r12)
- std r26, VCPU_GPR(r26)(r12)
- std r27, VCPU_GPR(r27)(r12)
- std r28, VCPU_GPR(r28)(r12)
- std r29, VCPU_GPR(r29)(r12)
- std r30, VCPU_GPR(r30)(r12)
- std r31, VCPU_GPR(r31)(r12)
-
- /* Restore non-volatile host registers (r14 - r31) */
- REST_NVGPRS(r1)
-
- /* Save guest PC (R10) */
- std r10, VCPU_PC(r12)
-
- /* Save guest msr (R11) */
- std r11, VCPU_SHADOW_MSR(r12)
-
- /* Save guest CTR (in R12) */
+ std r14, VCPU_GPR(r14)(r7)
+ std r15, VCPU_GPR(r15)(r7)
+ std r16, VCPU_GPR(r16)(r7)
+ std r17, VCPU_GPR(r17)(r7)
+ std r18, VCPU_GPR(r18)(r7)
+ std r19, VCPU_GPR(r19)(r7)
+ std r20, VCPU_GPR(r20)(r7)
+ std r21, VCPU_GPR(r21)(r7)
+ std r22, VCPU_GPR(r22)(r7)
+ std r23, VCPU_GPR(r23)(r7)
+ std r24, VCPU_GPR(r24)(r7)
+ std r25, VCPU_GPR(r25)(r7)
+ std r26, VCPU_GPR(r26)(r7)
+ std r27, VCPU_GPR(r27)(r7)
+ std r28, VCPU_GPR(r28)(r7)
+ std r29, VCPU_GPR(r29)(r7)
+ std r30, VCPU_GPR(r30)(r7)
+ std r31, VCPU_GPR(r31)(r7)
+
+ /* Save guest CTR */
mfctr r5
- std r5, VCPU_CTR(r12)
+ std r5, VCPU_CTR(r7)
/* Save guest LR */
mflr r5
- std r5, VCPU_LR(r12)
-
- /* Save guest XER */
- mfxer r5
- std r5, VCPU_XER(r12)
-
- /* Save guest DAR */
- ld r5, (PACA_EXMC+EX_DAR)(r13)
- std r5, VCPU_FAULT_DEAR(r12)
-
- /* Save guest DSISR */
- lwz r5, (PACA_EXMC+EX_DSISR)(r13)
- std r5, VCPU_FAULT_DSISR(r12)
+ std r5, VCPU_LR(r7)
/* Restore host msr -> SRR1 */
- ld r7, VCPU_HOST_MSR(r12)
- mtsrr1 r7
-
- /* Restore host IP -> SRR0 */
- ld r6, VCPU_HOST_RETIP(r12)
- mtsrr0 r6
+ ld r6, VCPU_HOST_MSR(r7)
/*
* For some interrupts, we need to call the real Linux
@@ -314,13 +224,14 @@ no_dcbz32_off:
* r3 = address of interrupt handler (exit reason)
*/
- cmpwi r3, BOOK3S_INTERRUPT_EXTERNAL
+ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
beq call_linux_handler
- cmpwi r3, BOOK3S_INTERRUPT_DECREMENTER
+ cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER
beq call_linux_handler
- /* Back to Interruptable Mode! (goto kvm_return_point) */
- RFI
+ /* Back to EE=1 */
+ mtmsr r6
+ b kvm_return_point
call_linux_handler:
@@ -333,16 +244,22 @@ call_linux_handler:
* interrupt handler!
*
* R3 still contains the exit code,
- * R6 VCPU_HOST_RETIP and
- * R7 VCPU_HOST_MSR
+ * R5 VCPU_HOST_RETIP and
+ * R6 VCPU_HOST_MSR
*/
- mtlr r3
+ /* Restore host IP -> SRR0 */
+ ld r5, VCPU_HOST_RETIP(r7)
+
+ /* XXX Better move to a safe function?
+ * What if we get an HTAB flush in between mtsrr0 and mtsrr1? */
- ld r5, VCPU_TRAMPOLINE_LOWMEM(r12)
- mtsrr0 r5
- LOAD_REG_IMMEDIATE(r5, MSR_KERNEL & ~(MSR_IR | MSR_DR))
- mtsrr1 r5
+ mtlr r12
+
+ ld r4, VCPU_TRAMPOLINE_LOWMEM(r7)
+ mtsrr0 r4
+ LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+ mtsrr1 r3
RFI
@@ -351,42 +268,51 @@ kvm_return_point:
/* Jump back to lightweight entry if we're supposed to */
/* go back into the guest */
- mr r5, r3
+
+ /* Pass the exit number as 3rd argument to kvmppc_handle_exit */
+ mr r5, r12
+
/* Restore r3 (kvm_run) and r4 (vcpu) */
REST_2GPRS(3, r1)
bl KVMPPC_HANDLE_EXIT
-#if 0 /* XXX get lightweight exits back */
+ /* If RESUME_GUEST, get back in the loop */
cmpwi r3, RESUME_GUEST
- bne kvm_exit_heavyweight
+ beq kvm_loop_lightweight
- /* put VCPU and KVM_RUN back into place and roll again! */
- REST_2GPRS(3, r1)
- b kvm_start_lightweight
+ cmpwi r3, RESUME_GUEST_NV
+ beq kvm_loop_heavyweight
-kvm_exit_heavyweight:
- /* Restore non-volatile host registers */
- ld r14, _LINK(r1)
- mtlr r14
- REST_NVGPRS(r1)
+kvm_exit_loop:
- addi r1, r1, SWITCH_FRAME_SIZE
-#else
ld r4, _LINK(r1)
mtlr r4
- cmpwi r3, RESUME_GUEST
- bne kvm_exit_heavyweight
+ /* Restore non-volatile host registers (r14 - r31) */
+ REST_NVGPRS(r1)
+
+ addi r1, r1, SWITCH_FRAME_SIZE
+ blr
+
+kvm_loop_heavyweight:
+
+ ld r4, _LINK(r1)
+ std r4, (16 + SWITCH_FRAME_SIZE)(r1)
+ /* Load vcpu and cpu_run */
REST_2GPRS(3, r1)
- addi r1, r1, SWITCH_FRAME_SIZE
+ /* Load non-volatile guest state from the vcpu */
+ VCPU_LOAD_NVGPRS(r4)
- b kvm_start_entry
+ /* Jump back into the beginning of this function */
+ b kvm_start_lightweight
-kvm_exit_heavyweight:
+kvm_loop_lightweight:
- addi r1, r1, SWITCH_FRAME_SIZE
-#endif
+ /* We'll need the vcpu pointer */
+ REST_GPR(4, r1)
+
+ /* Jump back into the beginning of this function */
+ b kvm_start_lightweight
- blr
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index e4beeb371a7..512dcff7755 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -54,7 +54,7 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
if (!vcpu_book3s->slb[i].valid)
continue;
- if (vcpu_book3s->slb[i].large)
+ if (vcpu_book3s->slb[i].tb)
cmp_esid = esid_1t;
if (vcpu_book3s->slb[i].esid == cmp_esid)
@@ -65,9 +65,10 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
eaddr, esid, esid_1t);
for (i = 0; i < vcpu_book3s->slb_nr; i++) {
if (vcpu_book3s->slb[i].vsid)
- dprintk(" %d: %c%c %llx %llx\n", i,
+ dprintk(" %d: %c%c%c %llx %llx\n", i,
vcpu_book3s->slb[i].valid ? 'v' : ' ',
vcpu_book3s->slb[i].large ? 'l' : ' ',
+ vcpu_book3s->slb[i].tb ? 't' : ' ',
vcpu_book3s->slb[i].esid,
vcpu_book3s->slb[i].vsid);
}
@@ -84,7 +85,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
if (!slb)
return 0;
- if (slb->large)
+ if (slb->tb)
return (((u64)eaddr >> 12) & 0xfffffff) |
(((u64)slb->vsid) << 28);
@@ -309,7 +310,8 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
slbe = &vcpu_book3s->slb[slb_nr];
slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
- slbe->esid = slbe->large ? esid_1t : esid;
+ slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0;
+ slbe->esid = slbe->tb ? esid_1t : esid;
slbe->vsid = rs >> 12;
slbe->valid = (rb & SLB_ESID_V) ? 1 : 0;
slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0;
diff --git a/arch/powerpc/kvm/book3s_64_rmhandlers.S b/arch/powerpc/kvm/book3s_64_rmhandlers.S
index fb7dd2e9ac8..c83c60ad96c 100644
--- a/arch/powerpc/kvm/book3s_64_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_64_rmhandlers.S
@@ -45,36 +45,25 @@ kvmppc_trampoline_\intno:
* To distinguish, we check a magic byte in the PACA
*/
mfspr r13, SPRN_SPRG_PACA /* r13 = PACA */
- std r12, (PACA_EXMC + EX_R12)(r13)
+ std r12, PACA_KVM_SCRATCH0(r13)
mfcr r12
- stw r12, (PACA_EXMC + EX_CCR)(r13)
+ stw r12, PACA_KVM_SCRATCH1(r13)
lbz r12, PACA_KVM_IN_GUEST(r13)
- cmpwi r12, 0
+ cmpwi r12, KVM_GUEST_MODE_NONE
bne ..kvmppc_handler_hasmagic_\intno
/* No KVM guest? Then jump back to the Linux handler! */
- lwz r12, (PACA_EXMC + EX_CCR)(r13)
+ lwz r12, PACA_KVM_SCRATCH1(r13)
mtcr r12
- ld r12, (PACA_EXMC + EX_R12)(r13)
+ ld r12, PACA_KVM_SCRATCH0(r13)
mfspr r13, SPRN_SPRG_SCRATCH0 /* r13 = original r13 */
b kvmppc_resume_\intno /* Get back original handler */
/* Now we know we're handling a KVM guest */
..kvmppc_handler_hasmagic_\intno:
- /* Unset guest state */
- li r12, 0
- stb r12, PACA_KVM_IN_GUEST(r13)
- std r1, (PACA_EXMC+EX_R9)(r13)
- std r10, (PACA_EXMC+EX_R10)(r13)
- std r11, (PACA_EXMC+EX_R11)(r13)
- std r2, (PACA_EXMC+EX_R13)(r13)
-
- mfsrr0 r10
- mfsrr1 r11
-
- /* Restore R1/R2 so we can handle faults */
- ld r1, PACAR1(r13)
- ld r2, (PACA_EXMC+EX_SRR0)(r13)
+ /* Should we just skip the faulting instruction? */
+ cmpwi r12, KVM_GUEST_MODE_SKIP
+ beq kvmppc_handler_skip_ins
/* Let's store which interrupt we're handling */
li r12, \intno
@@ -102,23 +91,107 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX
/*
+ * Bring us back to the faulting code, but skip the
+ * faulting instruction.
+ *
+ * This is a generic exit path from the interrupt
+ * trampolines above.
+ *
+ * Input Registers:
+ *
+ * R12 = free
+ * R13 = PACA
+ * PACA.KVM.SCRATCH0 = guest R12
+ * PACA.KVM.SCRATCH1 = guest CR
+ * SPRG_SCRATCH0 = guest R13
+ *
+ */
+kvmppc_handler_skip_ins:
+
+ /* Patch the IP to the next instruction */
+ mfsrr0 r12
+ addi r12, r12, 4
+ mtsrr0 r12
+
+ /* Clean up all state */
+ lwz r12, PACA_KVM_SCRATCH1(r13)
+ mtcr r12
+ ld r12, PACA_KVM_SCRATCH0(r13)
+ mfspr r13, SPRN_SPRG_SCRATCH0
+
+ /* And get back into the code */
+ RFI
+
+/*
* This trampoline brings us back to a real mode handler
*
* Input Registers:
*
- * R6 = SRR0
- * R7 = SRR1
+ * R5 = SRR0
+ * R6 = SRR1
* LR = real-mode IP
*
*/
.global kvmppc_handler_lowmem_trampoline
kvmppc_handler_lowmem_trampoline:
- mtsrr0 r6
- mtsrr1 r7
+ mtsrr0 r5
+ mtsrr1 r6
blr
kvmppc_handler_lowmem_trampoline_end:
+/*
+ * Call a function in real mode
+ *
+ * Input Registers:
+ *
+ * R3 = function
+ * R4 = MSR
+ * R5 = CTR
+ *
+ */
+_GLOBAL(kvmppc_rmcall)
+ mtmsr r4 /* Disable relocation, so mtsrr
+ doesn't get interrupted */
+ mtctr r5
+ mtsrr0 r3
+ mtsrr1 r4
+ RFI
+
+/*
+ * Activate current's external feature (FPU/Altivec/VSX)
+ */
+#define define_load_up(what) \
+ \
+_GLOBAL(kvmppc_load_up_ ## what); \
+ subi r1, r1, INT_FRAME_SIZE; \
+ mflr r3; \
+ std r3, _LINK(r1); \
+ mfmsr r4; \
+ std r31, GPR3(r1); \
+ mr r31, r4; \
+ li r5, MSR_DR; \
+ oris r5, r5, MSR_EE@h; \
+ andc r4, r4, r5; \
+ mtmsr r4; \
+ \
+ bl .load_up_ ## what; \
+ \
+ mtmsr r31; \
+ ld r3, _LINK(r1); \
+ ld r31, GPR3(r1); \
+ addi r1, r1, INT_FRAME_SIZE; \
+ mtlr r3; \
+ blr
+
+define_load_up(fpu)
+#ifdef CONFIG_ALTIVEC
+define_load_up(altivec)
+#endif
+#ifdef CONFIG_VSX
+define_load_up(vsx)
+#endif
+
.global kvmppc_trampoline_lowmem
kvmppc_trampoline_lowmem:
.long kvmppc_handler_lowmem_trampoline - _stext
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index ecd237a03fd..35b76272218 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -31,7 +31,7 @@
#define REBOLT_SLB_ENTRY(num) \
ld r10, SHADOW_SLB_ESID(num)(r11); \
cmpdi r10, 0; \
- beq slb_exit_skip_1; \
+ beq slb_exit_skip_ ## num; \
oris r10, r10, SLB_ESID_V@h; \
ld r9, SHADOW_SLB_VSID(num)(r11); \
slbmte r9, r10; \
@@ -51,23 +51,21 @@ kvmppc_handler_trampoline_enter:
*
* MSR = ~IR|DR
* R13 = PACA
+ * R1 = host R1
+ * R2 = host R2
* R9 = guest IP
* R10 = guest MSR
- * R11 = free
- * R12 = free
- * PACA[PACA_EXMC + EX_R9] = guest R9
- * PACA[PACA_EXMC + EX_R10] = guest R10
- * PACA[PACA_EXMC + EX_R11] = guest R11
- * PACA[PACA_EXMC + EX_R12] = guest R12
- * PACA[PACA_EXMC + EX_R13] = guest R13
- * PACA[PACA_EXMC + EX_CCR] = guest CR
- * PACA[PACA_EXMC + EX_R3] = guest XER
+ * all other GPRS = free
+ * PACA[KVM_CR] = guest CR
+ * PACA[KVM_XER] = guest XER
*/
mtsrr0 r9
mtsrr1 r10
- mtspr SPRN_SPRG_SCRATCH0, r0
+ /* Activate guest mode, so faults get handled by KVM */
+ li r11, KVM_GUEST_MODE_GUEST
+ stb r11, PACA_KVM_IN_GUEST(r13)
/* Remove LPAR shadow entries */
@@ -131,20 +129,27 @@ slb_do_enter:
/* Enter guest */
- mfspr r0, SPRN_SPRG_SCRATCH0
-
- ld r9, (PACA_EXMC+EX_R9)(r13)
- ld r10, (PACA_EXMC+EX_R10)(r13)
- ld r12, (PACA_EXMC+EX_R12)(r13)
-
- lwz r11, (PACA_EXMC+EX_CCR)(r13)
+ ld r0, (PACA_KVM_R0)(r13)
+ ld r1, (PACA_KVM_R1)(r13)
+ ld r2, (PACA_KVM_R2)(r13)
+ ld r3, (PACA_KVM_R3)(r13)
+ ld r4, (PACA_KVM_R4)(r13)
+ ld r5, (PACA_KVM_R5)(r13)
+ ld r6, (PACA_KVM_R6)(r13)
+ ld r7, (PACA_KVM_R7)(r13)
+ ld r8, (PACA_KVM_R8)(r13)
+ ld r9, (PACA_KVM_R9)(r13)
+ ld r10, (PACA_KVM_R10)(r13)
+ ld r12, (PACA_KVM_R12)(r13)
+
+ lwz r11, (PACA_KVM_CR)(r13)
mtcr r11
- ld r11, (PACA_EXMC+EX_R3)(r13)
+ ld r11, (PACA_KVM_XER)(r13)
mtxer r11
- ld r11, (PACA_EXMC+EX_R11)(r13)
- ld r13, (PACA_EXMC+EX_R13)(r13)
+ ld r11, (PACA_KVM_R11)(r13)
+ ld r13, (PACA_KVM_R13)(r13)
RFI
kvmppc_handler_trampoline_enter_end:
@@ -162,28 +167,54 @@ kvmppc_handler_trampoline_exit:
/* Register usage at this point:
*
- * SPRG_SCRATCH0 = guest R13
- * R01 = host R1
- * R02 = host R2
- * R10 = guest PC
- * R11 = guest MSR
- * R12 = exit handler id
- * R13 = PACA
- * PACA.exmc.CCR = guest CR
- * PACA.exmc.R9 = guest R1
- * PACA.exmc.R10 = guest R10
- * PACA.exmc.R11 = guest R11
- * PACA.exmc.R12 = guest R12
- * PACA.exmc.R13 = guest R2
+ * SPRG_SCRATCH0 = guest R13
+ * R12 = exit handler id
+ * R13 = PACA
+ * PACA.KVM.SCRATCH0 = guest R12
+ * PACA.KVM.SCRATCH1 = guest CR
*
*/
/* Save registers */
- std r0, (PACA_EXMC+EX_SRR0)(r13)
- std r9, (PACA_EXMC+EX_R3)(r13)
- std r10, (PACA_EXMC+EX_LR)(r13)
- std r11, (PACA_EXMC+EX_DAR)(r13)
+ std r0, PACA_KVM_R0(r13)
+ std r1, PACA_KVM_R1(r13)
+ std r2, PACA_KVM_R2(r13)
+ std r3, PACA_KVM_R3(r13)
+ std r4, PACA_KVM_R4(r13)
+ std r5, PACA_KVM_R5(r13)
+ std r6, PACA_KVM_R6(r13)
+ std r7, PACA_KVM_R7(r13)
+ std r8, PACA_KVM_R8(r13)
+ std r9, PACA_KVM_R9(r13)
+ std r10, PACA_KVM_R10(r13)
+ std r11, PACA_KVM_R11(r13)
+
+ /* Restore R1/R2 so we can handle faults */
+ ld r1, PACA_KVM_HOST_R1(r13)
+ ld r2, PACA_KVM_HOST_R2(r13)
+
+ /* Save guest PC and MSR in GPRs */
+ mfsrr0 r3
+ mfsrr1 r4
+
+ /* Get scratch'ed off registers */
+ mfspr r9, SPRN_SPRG_SCRATCH0
+ std r9, PACA_KVM_R13(r13)
+
+ ld r8, PACA_KVM_SCRATCH0(r13)
+ std r8, PACA_KVM_R12(r13)
+
+ lwz r7, PACA_KVM_SCRATCH1(r13)
+ stw r7, PACA_KVM_CR(r13)
+
+ /* Save more register state */
+
+ mfxer r6
+ stw r6, PACA_KVM_XER(r13)
+
+ mfdar r5
+ mfdsisr r6
/*
* In order for us to easily get the last instruction,
@@ -202,17 +233,28 @@ kvmppc_handler_trampoline_exit:
ld_last_inst:
/* Save off the guest instruction we're at */
+
+ /* Set guest mode to 'jump over instruction' so if lwz faults
+ * we'll just continue at the next IP. */
+ li r9, KVM_GUEST_MODE_SKIP
+ stb r9, PACA_KVM_IN_GUEST(r13)
+
/* 1) enable paging for data */
mfmsr r9
ori r11, r9, MSR_DR /* Enable paging for data */
mtmsr r11
/* 2) fetch the instruction */
- lwz r0, 0(r10)
+ li r0, KVM_INST_FETCH_FAILED /* In case lwz faults */
+ lwz r0, 0(r3)
/* 3) disable paging again */
mtmsr r9
no_ld_last_inst:
+ /* Unset guest mode */
+ li r9, KVM_GUEST_MODE_NONE
+ stb r9, PACA_KVM_IN_GUEST(r13)
+
/* Restore bolted entries from the shadow and fix it along the way */
/* We don't store anything in entry 0, so we don't need to take care of it */
@@ -233,29 +275,27 @@ no_ld_last_inst:
slb_do_exit:
- /* Restore registers */
-
- ld r11, (PACA_EXMC+EX_DAR)(r13)
- ld r10, (PACA_EXMC+EX_LR)(r13)
- ld r9, (PACA_EXMC+EX_R3)(r13)
-
- /* Save last inst */
- stw r0, (PACA_EXMC+EX_LR)(r13)
-
- /* Save DAR and DSISR before going to paged mode */
- mfdar r0
- std r0, (PACA_EXMC+EX_DAR)(r13)
- mfdsisr r0
- stw r0, (PACA_EXMC+EX_DSISR)(r13)
+ /* Register usage at this point:
+ *
+ * R0 = guest last inst
+ * R1 = host R1
+ * R2 = host R2
+ * R3 = guest PC
+ * R4 = guest MSR
+ * R5 = guest DAR
+ * R6 = guest DSISR
+ * R12 = exit handler id
+ * R13 = PACA
+ * PACA.KVM.* = guest *
+ *
+ */
/* RFI into the highmem handler */
- mfmsr r0
- ori r0, r0, MSR_IR|MSR_DR|MSR_RI /* Enable paging */
- mtsrr1 r0
- ld r0, PACASAVEDMSR(r13) /* Highmem handler address */
- mtsrr0 r0
-
- mfspr r0, SPRN_SPRG_SCRATCH0
+ mfmsr r7
+ ori r7, r7, MSR_IR|MSR_DR|MSR_RI /* Enable paging */
+ mtsrr1 r7
+ ld r8, PACA_KVM_VMHANDLER(r13) /* Highmem handler address */
+ mtsrr0 r8
RFI
kvmppc_handler_trampoline_exit_end:
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 06f5a9ecc42..4d686cc6b26 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -69,10 +69,10 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
for (i = 0; i < 32; i += 4) {
printk("gpr%02d: %08lx %08lx %08lx %08lx\n", i,
- vcpu->arch.gpr[i],
- vcpu->arch.gpr[i+1],
- vcpu->arch.gpr[i+2],
- vcpu->arch.gpr[i+3]);
+ kvmppc_get_gpr(vcpu, i),
+ kvmppc_get_gpr(vcpu, i+1),
+ kvmppc_get_gpr(vcpu, i+2),
+ kvmppc_get_gpr(vcpu, i+3));
}
}
@@ -82,8 +82,32 @@ static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
set_bit(priority, &vcpu->arch.pending_exceptions);
}
-void kvmppc_core_queue_program(struct kvm_vcpu *vcpu)
+static void kvmppc_core_queue_dtlb_miss(struct kvm_vcpu *vcpu,
+ ulong dear_flags, ulong esr_flags)
{
+ vcpu->arch.queued_dear = dear_flags;
+ vcpu->arch.queued_esr = esr_flags;
+ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
+}
+
+static void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu,
+ ulong dear_flags, ulong esr_flags)
+{
+ vcpu->arch.queued_dear = dear_flags;
+ vcpu->arch.queued_esr = esr_flags;
+ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE);
+}
+
+static void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu,
+ ulong esr_flags)
+{
+ vcpu->arch.queued_esr = esr_flags;
+ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
+}
+
+void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags)
+{
+ vcpu->arch.queued_esr = esr_flags;
kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
}
@@ -97,6 +121,11 @@ int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
return test_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
}
+void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
+{
+ clear_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
+}
+
void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
@@ -109,14 +138,19 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
{
int allowed = 0;
ulong msr_mask;
+ bool update_esr = false, update_dear = false;
switch (priority) {
- case BOOKE_IRQPRIO_PROGRAM:
case BOOKE_IRQPRIO_DTLB_MISS:
- case BOOKE_IRQPRIO_ITLB_MISS:
- case BOOKE_IRQPRIO_SYSCALL:
case BOOKE_IRQPRIO_DATA_STORAGE:
+ update_dear = true;
+ /* fall through */
case BOOKE_IRQPRIO_INST_STORAGE:
+ case BOOKE_IRQPRIO_PROGRAM:
+ update_esr = true;
+ /* fall through */
+ case BOOKE_IRQPRIO_ITLB_MISS:
+ case BOOKE_IRQPRIO_SYSCALL:
case BOOKE_IRQPRIO_FP_UNAVAIL:
case BOOKE_IRQPRIO_SPE_UNAVAIL:
case BOOKE_IRQPRIO_SPE_FP_DATA:
@@ -151,6 +185,10 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
vcpu->arch.srr0 = vcpu->arch.pc;
vcpu->arch.srr1 = vcpu->arch.msr;
vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
+ if (update_esr == true)
+ vcpu->arch.esr = vcpu->arch.queued_esr;
+ if (update_dear == true)
+ vcpu->arch.dear = vcpu->arch.queued_dear;
kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask);
clear_bit(priority, &vcpu->arch.pending_exceptions);
@@ -223,8 +261,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
if (vcpu->arch.msr & MSR_PR) {
/* Program traps generated by user-level software must be handled
* by the guest kernel. */
- vcpu->arch.esr = vcpu->arch.fault_esr;
- kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
+ kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);
r = RESUME_GUEST;
kvmppc_account_exit(vcpu, USR_PR_INST);
break;
@@ -280,16 +317,14 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
case BOOKE_INTERRUPT_DATA_STORAGE:
- vcpu->arch.dear = vcpu->arch.fault_dear;
- vcpu->arch.esr = vcpu->arch.fault_esr;
- kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE);
+ kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear,
+ vcpu->arch.fault_esr);
kvmppc_account_exit(vcpu, DSI_EXITS);
r = RESUME_GUEST;
break;
case BOOKE_INTERRUPT_INST_STORAGE:
- vcpu->arch.esr = vcpu->arch.fault_esr;
- kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
+ kvmppc_core_queue_inst_storage(vcpu, vcpu->arch.fault_esr);
kvmppc_account_exit(vcpu, ISI_EXITS);
r = RESUME_GUEST;
break;
@@ -310,9 +345,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
if (gtlb_index < 0) {
/* The guest didn't have a mapping for it. */
- kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
- vcpu->arch.dear = vcpu->arch.fault_dear;
- vcpu->arch.esr = vcpu->arch.fault_esr;
+ kvmppc_core_queue_dtlb_miss(vcpu,
+ vcpu->arch.fault_dear,
+ vcpu->arch.fault_esr);
kvmppc_mmu_dtlb_miss(vcpu);
kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS);
r = RESUME_GUEST;
@@ -426,7 +461,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
vcpu->arch.pc = 0;
vcpu->arch.msr = 0;
- vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
+ kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
vcpu->arch.shadow_pid = 1;
@@ -444,10 +479,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int i;
regs->pc = vcpu->arch.pc;
- regs->cr = vcpu->arch.cr;
+ regs->cr = kvmppc_get_cr(vcpu);
regs->ctr = vcpu->arch.ctr;
regs->lr = vcpu->arch.lr;
- regs->xer = vcpu->arch.xer;
+ regs->xer = kvmppc_get_xer(vcpu);
regs->msr = vcpu->arch.msr;
regs->srr0 = vcpu->arch.srr0;
regs->srr1 = vcpu->arch.srr1;
@@ -461,7 +496,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->sprg7 = vcpu->arch.sprg6;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
- regs->gpr[i] = vcpu->arch.gpr[i];
+ regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
return 0;
}
@@ -471,10 +506,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int i;
vcpu->arch.pc = regs->pc;
- vcpu->arch.cr = regs->cr;
+ kvmppc_set_cr(vcpu, regs->cr);
vcpu->arch.ctr = regs->ctr;
vcpu->arch.lr = regs->lr;
- vcpu->arch.xer = regs->xer;
+ kvmppc_set_xer(vcpu, regs->xer);
kvmppc_set_msr(vcpu, regs->msr);
vcpu->arch.srr0 = regs->srr0;
vcpu->arch.srr1 = regs->srr1;
@@ -486,8 +521,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.sprg6 = regs->sprg5;
vcpu->arch.sprg7 = regs->sprg6;
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++)
- vcpu->arch.gpr[i] = regs->gpr[i];
+ for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+ kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
return 0;
}
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index aebc65e93f4..cbc790ee192 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -62,20 +62,20 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
case OP_31_XOP_MFMSR:
rt = get_rt(inst);
- vcpu->arch.gpr[rt] = vcpu->arch.msr;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.msr);
kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
break;
case OP_31_XOP_MTMSR:
rs = get_rs(inst);
kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS);
- kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
+ kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs));
break;
case OP_31_XOP_WRTEE:
rs = get_rs(inst);
vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
- | (vcpu->arch.gpr[rs] & MSR_EE);
+ | (kvmppc_get_gpr(vcpu, rs) & MSR_EE);
kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
break;
@@ -101,22 +101,23 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
{
int emulated = EMULATE_DONE;
+ ulong spr_val = kvmppc_get_gpr(vcpu, rs);
switch (sprn) {
case SPRN_DEAR:
- vcpu->arch.dear = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.dear = spr_val; break;
case SPRN_ESR:
- vcpu->arch.esr = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.esr = spr_val; break;
case SPRN_DBCR0:
- vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.dbcr0 = spr_val; break;
case SPRN_DBCR1:
- vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.dbcr1 = spr_val; break;
case SPRN_DBSR:
- vcpu->arch.dbsr &= ~vcpu->arch.gpr[rs]; break;
+ vcpu->arch.dbsr &= ~spr_val; break;
case SPRN_TSR:
- vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break;
+ vcpu->arch.tsr &= ~spr_val; break;
case SPRN_TCR:
- vcpu->arch.tcr = vcpu->arch.gpr[rs];
+ vcpu->arch.tcr = spr_val;
kvmppc_emulate_dec(vcpu);
break;
@@ -124,64 +125,64 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
* loaded into the real SPRGs when resuming the
* guest. */
case SPRN_SPRG4:
- vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg4 = spr_val; break;
case SPRN_SPRG5:
- vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg5 = spr_val; break;
case SPRN_SPRG6:
- vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg6 = spr_val; break;
case SPRN_SPRG7:
- vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg7 = spr_val; break;
case SPRN_IVPR:
- vcpu->arch.ivpr = vcpu->arch.gpr[rs];
+ vcpu->arch.ivpr = spr_val;
break;
case SPRN_IVOR0:
- vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val;
break;
case SPRN_IVOR1:
- vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = spr_val;
break;
case SPRN_IVOR2:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val;
break;
case SPRN_IVOR3:
- vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val;
break;
case SPRN_IVOR4:
- vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = spr_val;
break;
case SPRN_IVOR5:
- vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = spr_val;
break;
case SPRN_IVOR6:
- vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = spr_val;
break;
case SPRN_IVOR7:
- vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = spr_val;
break;
case SPRN_IVOR8:
- vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val;
break;
case SPRN_IVOR9:
- vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val;
break;
case SPRN_IVOR10:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = spr_val;
break;
case SPRN_IVOR11:
- vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = spr_val;
break;
case SPRN_IVOR12:
- vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = spr_val;
break;
case SPRN_IVOR13:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = spr_val;
break;
case SPRN_IVOR14:
- vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = spr_val;
break;
case SPRN_IVOR15:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val;
break;
default:
@@ -197,65 +198,65 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
switch (sprn) {
case SPRN_IVPR:
- vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break;
case SPRN_DEAR:
- vcpu->arch.gpr[rt] = vcpu->arch.dear; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break;
case SPRN_ESR:
- vcpu->arch.gpr[rt] = vcpu->arch.esr; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break;
case SPRN_DBCR0:
- vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break;
case SPRN_DBCR1:
- vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break;
case SPRN_DBSR:
- vcpu->arch.gpr[rt] = vcpu->arch.dbsr; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break;
case SPRN_IVOR0:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]);
break;
case SPRN_IVOR1:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]);
break;
case SPRN_IVOR2:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]);
break;
case SPRN_IVOR3:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]);
break;
case SPRN_IVOR4:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]);
break;
case SPRN_IVOR5:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]);
break;
case SPRN_IVOR6:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]);
break;
case SPRN_IVOR7:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]);
break;
case SPRN_IVOR8:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]);
break;
case SPRN_IVOR9:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]);
break;
case SPRN_IVOR10:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]);
break;
case SPRN_IVOR11:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]);
break;
case SPRN_IVOR12:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]);
break;
case SPRN_IVOR13:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]);
break;
case SPRN_IVOR14:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]);
break;
case SPRN_IVOR15:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]);
break;
default:
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 64949eef43f..efa1198940a 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -60,6 +60,12 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
kvmppc_e500_tlb_setup(vcpu_e500);
+ /* Registers init */
+ vcpu->arch.pvr = mfspr(SPRN_PVR);
+
+ /* Since booke kvm only support one core, update all vcpus' PIR to 0 */
+ vcpu->vcpu_id = 0;
+
return 0;
}
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index be95b8d8e3b..8e3edfbc963 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -74,54 +74,59 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
{
struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
int emulated = EMULATE_DONE;
+ ulong spr_val = kvmppc_get_gpr(vcpu, rs);
switch (sprn) {
case SPRN_PID:
vcpu_e500->pid[0] = vcpu->arch.shadow_pid =
- vcpu->arch.pid = vcpu->arch.gpr[rs];
+ vcpu->arch.pid = spr_val;
break;
case SPRN_PID1:
- vcpu_e500->pid[1] = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->pid[1] = spr_val; break;
case SPRN_PID2:
- vcpu_e500->pid[2] = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->pid[2] = spr_val; break;
case SPRN_MAS0:
- vcpu_e500->mas0 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas0 = spr_val; break;
case SPRN_MAS1:
- vcpu_e500->mas1 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas1 = spr_val; break;
case SPRN_MAS2:
- vcpu_e500->mas2 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas2 = spr_val; break;
case SPRN_MAS3:
- vcpu_e500->mas3 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas3 = spr_val; break;
case SPRN_MAS4:
- vcpu_e500->mas4 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas4 = spr_val; break;
case SPRN_MAS6:
- vcpu_e500->mas6 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas6 = spr_val; break;
case SPRN_MAS7:
- vcpu_e500->mas7 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas7 = spr_val; break;
+ case SPRN_L1CSR0:
+ vcpu_e500->l1csr0 = spr_val;
+ vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC);
+ break;
case SPRN_L1CSR1:
- vcpu_e500->l1csr1 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->l1csr1 = spr_val; break;
case SPRN_HID0:
- vcpu_e500->hid0 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->hid0 = spr_val; break;
case SPRN_HID1:
- vcpu_e500->hid1 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->hid1 = spr_val; break;
case SPRN_MMUCSR0:
emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500,
- vcpu->arch.gpr[rs]);
+ spr_val);
break;
/* extra exceptions */
case SPRN_IVOR32:
- vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = spr_val;
break;
case SPRN_IVOR33:
- vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = spr_val;
break;
case SPRN_IVOR34:
- vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = spr_val;
break;
case SPRN_IVOR35:
- vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val;
break;
default:
@@ -138,63 +143,57 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
switch (sprn) {
case SPRN_PID:
- vcpu->arch.gpr[rt] = vcpu_e500->pid[0]; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[0]); break;
case SPRN_PID1:
- vcpu->arch.gpr[rt] = vcpu_e500->pid[1]; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[1]); break;
case SPRN_PID2:
- vcpu->arch.gpr[rt] = vcpu_e500->pid[2]; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break;
case SPRN_MAS0:
- vcpu->arch.gpr[rt] = vcpu_e500->mas0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break;
case SPRN_MAS1:
- vcpu->arch.gpr[rt] = vcpu_e500->mas1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break;
case SPRN_MAS2:
- vcpu->arch.gpr[rt] = vcpu_e500->mas2; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break;
case SPRN_MAS3:
- vcpu->arch.gpr[rt] = vcpu_e500->mas3; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break;
case SPRN_MAS4:
- vcpu->arch.gpr[rt] = vcpu_e500->mas4; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break;
case SPRN_MAS6:
- vcpu->arch.gpr[rt] = vcpu_e500->mas6; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break;
case SPRN_MAS7:
- vcpu->arch.gpr[rt] = vcpu_e500->mas7; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break;
case SPRN_TLB0CFG:
- vcpu->arch.gpr[rt] = mfspr(SPRN_TLB0CFG);
- vcpu->arch.gpr[rt] &= ~0xfffUL;
- vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[0];
- break;
-
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break;
case SPRN_TLB1CFG:
- vcpu->arch.gpr[rt] = mfspr(SPRN_TLB1CFG);
- vcpu->arch.gpr[rt] &= ~0xfffUL;
- vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[1];
- break;
-
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb1cfg); break;
+ case SPRN_L1CSR0:
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr0); break;
case SPRN_L1CSR1:
- vcpu->arch.gpr[rt] = vcpu_e500->l1csr1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr1); break;
case SPRN_HID0:
- vcpu->arch.gpr[rt] = vcpu_e500->hid0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid0); break;
case SPRN_HID1:
- vcpu->arch.gpr[rt] = vcpu_e500->hid1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid1); break;
case SPRN_MMUCSR0:
- vcpu->arch.gpr[rt] = 0; break;
+ kvmppc_set_gpr(vcpu, rt, 0); break;
case SPRN_MMUCFG:
- vcpu->arch.gpr[rt] = mfspr(SPRN_MMUCFG); break;
+ kvmppc_set_gpr(vcpu, rt, mfspr(SPRN_MMUCFG)); break;
/* extra exceptions */
case SPRN_IVOR32:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]);
break;
case SPRN_IVOR33:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]);
break;
case SPRN_IVOR34:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]);
break;
case SPRN_IVOR35:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]);
break;
default:
emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt);
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index fb1e1dc11ba..0d772e6b631 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -417,7 +417,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
int esel, tlbsel;
gva_t ea;
- ea = ((ra) ? vcpu->arch.gpr[ra] : 0) + vcpu->arch.gpr[rb];
+ ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb);
ia = (ea >> 2) & 0x1;
@@ -470,7 +470,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
struct tlbe *gtlbe = NULL;
gva_t ea;
- ea = vcpu->arch.gpr[rb];
+ ea = kvmppc_get_gpr(vcpu, rb);
for (tlbsel = 0; tlbsel < 2; tlbsel++) {
esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
@@ -728,6 +728,12 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
if (vcpu_e500->shadow_pages[1] == NULL)
goto err_out_page0;
+ /* Init TLB configuration register */
+ vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL;
+ vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0];
+ vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL;
+ vcpu_e500->tlb1cfg |= vcpu_e500->guest_tlb_size[1];
+
return 0;
err_out_page0:
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 4a9ac6640fa..cb72a65f4ec 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -83,6 +83,9 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
pr_debug("mtDEC: %x\n", vcpu->arch.dec);
#ifdef CONFIG_PPC64
+ /* mtdec lowers the interrupt line when positive. */
+ kvmppc_core_dequeue_dec(vcpu);
+
/* POWER4+ triggers a dec interrupt if the value is < 0 */
if (vcpu->arch.dec & 0x80000000) {
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
@@ -140,14 +143,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
pr_debug(KERN_INFO "Emulating opcode %d / %d\n", get_op(inst), get_xop(inst));
+ /* Try again next time */
+ if (inst == KVM_INST_FETCH_FAILED)
+ return EMULATE_DONE;
+
switch (get_op(inst)) {
case OP_TRAP:
#ifdef CONFIG_PPC64
case OP_TRAP_64:
+ kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP);
#else
- vcpu->arch.esr |= ESR_PTR;
+ kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR);
#endif
- kvmppc_core_queue_program(vcpu);
advance = 0;
break;
@@ -167,14 +174,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
case OP_31_XOP_STWX:
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
4, 1);
break;
case OP_31_XOP_STBX:
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
1, 1);
break;
@@ -183,14 +190,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rb = get_rb(inst);
- ea = vcpu->arch.gpr[rb];
+ ea = kvmppc_get_gpr(vcpu, rb);
if (ra)
- ea += vcpu->arch.gpr[ra];
+ ea += kvmppc_get_gpr(vcpu, ra);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
1, 1);
- vcpu->arch.gpr[rs] = ea;
+ kvmppc_set_gpr(vcpu, rs, ea);
break;
case OP_31_XOP_LHZX:
@@ -203,12 +210,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rb = get_rb(inst);
- ea = vcpu->arch.gpr[rb];
+ ea = kvmppc_get_gpr(vcpu, rb);
if (ra)
- ea += vcpu->arch.gpr[ra];
+ ea += kvmppc_get_gpr(vcpu, ra);
emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
- vcpu->arch.gpr[ra] = ea;
+ kvmppc_set_gpr(vcpu, ra, ea);
break;
case OP_31_XOP_MFSPR:
@@ -217,47 +224,49 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
switch (sprn) {
case SPRN_SRR0:
- vcpu->arch.gpr[rt] = vcpu->arch.srr0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr0); break;
case SPRN_SRR1:
- vcpu->arch.gpr[rt] = vcpu->arch.srr1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr1); break;
case SPRN_PVR:
- vcpu->arch.gpr[rt] = vcpu->arch.pvr; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break;
case SPRN_PIR:
- vcpu->arch.gpr[rt] = vcpu->vcpu_id; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->vcpu_id); break;
case SPRN_MSSSR0:
- vcpu->arch.gpr[rt] = 0; break;
+ kvmppc_set_gpr(vcpu, rt, 0); break;
/* Note: mftb and TBRL/TBWL are user-accessible, so
* the guest can always access the real TB anyways.
* In fact, we probably will never see these traps. */
case SPRN_TBWL:
- vcpu->arch.gpr[rt] = get_tb() >> 32; break;
+ kvmppc_set_gpr(vcpu, rt, get_tb() >> 32); break;
case SPRN_TBWU:
- vcpu->arch.gpr[rt] = get_tb(); break;
+ kvmppc_set_gpr(vcpu, rt, get_tb()); break;
case SPRN_SPRG0:
- vcpu->arch.gpr[rt] = vcpu->arch.sprg0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg0); break;
case SPRN_SPRG1:
- vcpu->arch.gpr[rt] = vcpu->arch.sprg1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg1); break;
case SPRN_SPRG2:
- vcpu->arch.gpr[rt] = vcpu->arch.sprg2; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg2); break;
case SPRN_SPRG3:
- vcpu->arch.gpr[rt] = vcpu->arch.sprg3; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg3); break;
/* Note: SPRG4-7 are user-readable, so we don't get
* a trap. */
case SPRN_DEC:
{
u64 jd = get_tb() - vcpu->arch.dec_jiffies;
- vcpu->arch.gpr[rt] = vcpu->arch.dec - jd;
- pr_debug(KERN_INFO "mfDEC: %x - %llx = %lx\n", vcpu->arch.dec, jd, vcpu->arch.gpr[rt]);
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dec - jd);
+ pr_debug(KERN_INFO "mfDEC: %x - %llx = %lx\n",
+ vcpu->arch.dec, jd,
+ kvmppc_get_gpr(vcpu, rt));
break;
}
default:
emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt);
if (emulated == EMULATE_FAIL) {
printk("mfspr: unknown spr %x\n", sprn);
- vcpu->arch.gpr[rt] = 0;
+ kvmppc_set_gpr(vcpu, rt, 0);
}
break;
}
@@ -269,7 +278,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
rb = get_rb(inst);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
2, 1);
break;
@@ -278,14 +287,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rb = get_rb(inst);
- ea = vcpu->arch.gpr[rb];
+ ea = kvmppc_get_gpr(vcpu, rb);
if (ra)
- ea += vcpu->arch.gpr[ra];
+ ea += kvmppc_get_gpr(vcpu, ra);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
2, 1);
- vcpu->arch.gpr[ra] = ea;
+ kvmppc_set_gpr(vcpu, ra, ea);
break;
case OP_31_XOP_MTSPR:
@@ -293,9 +302,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
rs = get_rs(inst);
switch (sprn) {
case SPRN_SRR0:
- vcpu->arch.srr0 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.srr0 = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_SRR1:
- vcpu->arch.srr1 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.srr1 = kvmppc_get_gpr(vcpu, rs); break;
/* XXX We need to context-switch the timebase for
* watchdog and FIT. */
@@ -305,18 +314,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
case SPRN_MSSSR0: break;
case SPRN_DEC:
- vcpu->arch.dec = vcpu->arch.gpr[rs];
+ vcpu->arch.dec = kvmppc_get_gpr(vcpu, rs);
kvmppc_emulate_dec(vcpu);
break;
case SPRN_SPRG0:
- vcpu->arch.sprg0 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg0 = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_SPRG1:
- vcpu->arch.sprg1 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg1 = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_SPRG2:
- vcpu->arch.sprg2 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg2 = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_SPRG3:
- vcpu->arch.sprg3 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg3 = kvmppc_get_gpr(vcpu, rs); break;
default:
emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs);
@@ -348,7 +357,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
rb = get_rb(inst);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
4, 0);
break;
@@ -363,7 +372,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
rb = get_rb(inst);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
2, 0);
break;
@@ -382,7 +391,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
case OP_LBZ:
@@ -394,35 +403,39 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
case OP_STW:
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
4, 1);
break;
case OP_STWU:
ra = get_ra(inst);
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
4, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
case OP_STB:
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
1, 1);
break;
case OP_STBU:
ra = get_ra(inst);
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
1, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
case OP_LHZ:
@@ -434,21 +447,23 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
case OP_STH:
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
2, 1);
break;
case OP_STHU:
ra = get_ra(inst);
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
2, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
default:
@@ -461,6 +476,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
advance = 0;
printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
"(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
+ kvmppc_core_queue_program(vcpu, 0);
}
}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index f06cf93b178..51aedd7f16b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -137,6 +137,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvmppc_free_vcpus(kvm);
kvm_free_physmem(kvm);
+ cleanup_srcu_struct(&kvm->srcu);
kfree(kvm);
}
@@ -165,14 +166,24 @@ long kvm_arch_dev_ioctl(struct file *filp,
return -EINVAL;
}
-int kvm_arch_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
- struct kvm_memory_slot old,
- int user_alloc)
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc)
{
return 0;
}
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+ return;
+}
+
+
void kvm_arch_flush_shadow(struct kvm *kvm)
{
}
@@ -260,34 +271,35 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
struct kvm_run *run)
{
- ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
- *gpr = run->dcr.data;
+ kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, run->dcr.data);
}
static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
struct kvm_run *run)
{
- ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
+ ulong gpr;
- if (run->mmio.len > sizeof(*gpr)) {
+ if (run->mmio.len > sizeof(gpr)) {
printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
return;
}
if (vcpu->arch.mmio_is_bigendian) {
switch (run->mmio.len) {
- case 4: *gpr = *(u32 *)run->mmio.data; break;
- case 2: *gpr = *(u16 *)run->mmio.data; break;
- case 1: *gpr = *(u8 *)run->mmio.data; break;
+ case 4: gpr = *(u32 *)run->mmio.data; break;
+ case 2: gpr = *(u16 *)run->mmio.data; break;
+ case 1: gpr = *(u8 *)run->mmio.data; break;
}
} else {
/* Convert BE data from userland back to LE. */
switch (run->mmio.len) {
- case 4: *gpr = ld_le32((u32 *)run->mmio.data); break;
- case 2: *gpr = ld_le16((u16 *)run->mmio.data); break;
- case 1: *gpr = *(u8 *)run->mmio.data; break;
+ case 4: gpr = ld_le32((u32 *)run->mmio.data); break;
+ case 2: gpr = ld_le16((u16 *)run->mmio.data); break;
+ case 1: gpr = *(u8 *)run->mmio.data; break;
}
}
+
+ kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
}
int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3fa0a10e466..49292869a5c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -242,6 +242,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_free_physmem(kvm);
free_page((unsigned long)(kvm->arch.sca));
debug_unregister(kvm->arch.dbf);
+ cleanup_srcu_struct(&kvm->srcu);
kfree(kvm);
}
@@ -690,14 +691,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
/* Section: memory related */
-int kvm_arch_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
- struct kvm_memory_slot old,
- int user_alloc)
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc)
{
- int i;
- struct kvm_vcpu *vcpu;
-
/* A few sanity checks. We can have exactly one memory slot which has
to start at guest virtual zero and which has to be located at a
page boundary in userland and which has to end at a page boundary.
@@ -720,14 +719,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (!user_alloc)
return -EINVAL;
+ return 0;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+
/* request update of sie control block for all available vcpus */
kvm_for_each_vcpu(i, vcpu, kvm) {
if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
continue;
kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP);
}
-
- return 0;
}
void kvm_arch_flush_shadow(struct kvm *kvm)
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 06cce8285ba..60f09ab3672 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -67,10 +67,14 @@ static inline long kvm_s390_vcpu_get_memsize(struct kvm_vcpu *vcpu)
static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu)
{
+ int idx;
struct kvm_memory_slot *mem;
+ struct kvm_memslots *memslots;
- down_read(&vcpu->kvm->slots_lock);
- mem = &vcpu->kvm->memslots[0];
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ memslots = rcu_dereference(vcpu->kvm->memslots);
+
+ mem = &memslots->memslots[0];
vcpu->arch.sie_block->gmsor = mem->userspace_addr;
vcpu->arch.sie_block->gmslm =
@@ -78,7 +82,7 @@ static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu)
(mem->npages << PAGE_SHIFT) +
VIRTIODESCSPACE - 1ul;
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
/* implemented in priv.c */
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 9f828f87ca3..493092efaa3 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -11,6 +11,7 @@ header-y += sigcontext32.h
header-y += ucontext.h
header-y += processor-flags.h
header-y += hw_breakpoint.h
+header-y += hyperv.h
unifdef-y += e820.h
unifdef-y += ist.h
diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
new file mode 100644
index 00000000000..e153a2b3889
--- /dev/null
+++ b/arch/x86/include/asm/hyperv.h
@@ -0,0 +1,186 @@
+#ifndef _ASM_X86_KVM_HYPERV_H
+#define _ASM_X86_KVM_HYPERV_H
+
+#include <linux/types.h>
+
+/*
+ * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
+ * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
+ */
+#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
+#define HYPERV_CPUID_INTERFACE 0x40000001
+#define HYPERV_CPUID_VERSION 0x40000002
+#define HYPERV_CPUID_FEATURES 0x40000003
+#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
+#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+
+/*
+ * Feature identification. EAX indicates which features are available
+ * to the partition based upon the current partition privileges.
+ */
+
+/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
+#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
+/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
+#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
+/*
+ * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
+ * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
+ */
+#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
+/*
+ * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
+ * HV_X64_MSR_STIMER3_COUNT) available
+ */
+#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
+/*
+ * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
+ * are available
+ */
+#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
+/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
+#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
+/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
+#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
+/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
+#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
+ /*
+ * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
+ */
+#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
+
+/*
+ * Feature identification: EBX indicates which flags were specified at
+ * partition creation. The format is the same as the partition creation
+ * flag structure defined in section Partition Creation Flags.
+ */
+#define HV_X64_CREATE_PARTITIONS (1 << 0)
+#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
+#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
+#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
+#define HV_X64_POST_MESSAGES (1 << 4)
+#define HV_X64_SIGNAL_EVENTS (1 << 5)
+#define HV_X64_CREATE_PORT (1 << 6)
+#define HV_X64_CONNECT_PORT (1 << 7)
+#define HV_X64_ACCESS_STATS (1 << 8)
+#define HV_X64_DEBUGGING (1 << 11)
+#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
+#define HV_X64_CONFIGURE_PROFILER (1 << 13)
+
+/*
+ * Feature identification. EDX indicates which miscellaneous features
+ * are available to the partition.
+ */
+/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
+#define HV_X64_MWAIT_AVAILABLE (1 << 0)
+/* Guest debugging support is available */
+#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
+/* Performance Monitor support is available*/
+#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
+/* Support for physical CPU dynamic partitioning events is available*/
+#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
+/*
+ * Support for passing hypercall input parameter block via XMM
+ * registers is available
+ */
+#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
+/* Support for a virtual guest idle state is available */
+#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
+
+/*
+ * Implementation recommendations. Indicates which behaviors the hypervisor
+ * recommends the OS implement for optimal performance.
+ */
+ /*
+ * Recommend using hypercall for address space switches rather
+ * than MOV to CR3 instruction
+ */
+#define HV_X64_MWAIT_RECOMMENDED (1 << 0)
+/* Recommend using hypercall for local TLB flushes rather
+ * than INVLPG or MOV to CR3 instructions */
+#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
+/*
+ * Recommend using hypercall for remote TLB flushes rather
+ * than inter-processor interrupts
+ */
+#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
+/*
+ * Recommend using MSRs for accessing APIC registers
+ * EOI, ICR and TPR rather than their memory-mapped counterparts
+ */
+#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
+/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
+#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
+/*
+ * Recommend using relaxed timing for this partition. If used,
+ * the VM should disable any watchdog timeouts that rely on the
+ * timely delivery of external interrupts
+ */
+#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
+
+/* MSR used to identify the guest OS. */
+#define HV_X64_MSR_GUEST_OS_ID 0x40000000
+
+/* MSR used to setup pages used to communicate with the hypervisor. */
+#define HV_X64_MSR_HYPERCALL 0x40000001
+
+/* MSR used to provide vcpu index */
+#define HV_X64_MSR_VP_INDEX 0x40000002
+
+/* Define the virtual APIC registers */
+#define HV_X64_MSR_EOI 0x40000070
+#define HV_X64_MSR_ICR 0x40000071
+#define HV_X64_MSR_TPR 0x40000072
+#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
+
+/* Define synthetic interrupt controller model specific registers. */
+#define HV_X64_MSR_SCONTROL 0x40000080
+#define HV_X64_MSR_SVERSION 0x40000081
+#define HV_X64_MSR_SIEFP 0x40000082
+#define HV_X64_MSR_SIMP 0x40000083
+#define HV_X64_MSR_EOM 0x40000084
+#define HV_X64_MSR_SINT0 0x40000090
+#define HV_X64_MSR_SINT1 0x40000091
+#define HV_X64_MSR_SINT2 0x40000092
+#define HV_X64_MSR_SINT3 0x40000093
+#define HV_X64_MSR_SINT4 0x40000094
+#define HV_X64_MSR_SINT5 0x40000095
+#define HV_X64_MSR_SINT6 0x40000096
+#define HV_X64_MSR_SINT7 0x40000097
+#define HV_X64_MSR_SINT8 0x40000098
+#define HV_X64_MSR_SINT9 0x40000099
+#define HV_X64_MSR_SINT10 0x4000009A
+#define HV_X64_MSR_SINT11 0x4000009B
+#define HV_X64_MSR_SINT12 0x4000009C
+#define HV_X64_MSR_SINT13 0x4000009D
+#define HV_X64_MSR_SINT14 0x4000009E
+#define HV_X64_MSR_SINT15 0x4000009F
+
+
+#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
+
+/* Declare the various hypercall operations. */
+#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
+
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+#define HV_PROCESSOR_POWER_STATE_C0 0
+#define HV_PROCESSOR_POWER_STATE_C1 1
+#define HV_PROCESSOR_POWER_STATE_C2 2
+#define HV_PROCESSOR_POWER_STATE_C3 3
+
+/* hypercall status code */
+#define HV_STATUS_SUCCESS 0
+#define HV_STATUS_INVALID_HYPERCALL_CODE 2
+#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
+#define HV_STATUS_INVALID_ALIGNMENT 4
+
+#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7c18e1230f5..7a6f54fa13b 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -54,13 +54,23 @@ struct x86_emulate_ctxt;
struct x86_emulate_ops {
/*
* read_std: Read bytes of standard (non-emulated/special) memory.
- * Used for instruction fetch, stack operations, and others.
+ * Used for descriptor reading.
* @addr: [IN ] Linear address from which to read.
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*read_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu);
+ unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+
+ /*
+ * fetch: Read bytes of standard (non-emulated/special) memory.
+ * Used for instruction fetch.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*fetch)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
/*
* read_emulated: Read bytes from emulated/special memory area.
@@ -74,7 +84,7 @@ struct x86_emulate_ops {
struct kvm_vcpu *vcpu);
/*
- * write_emulated: Read bytes from emulated/special memory area.
+ * write_emulated: Write bytes to emulated/special memory area.
* @addr: [IN ] Linear address to which to write.
* @val: [IN ] Value to write to memory (low-order bytes used as
* required).
@@ -168,6 +178,7 @@ struct x86_emulate_ctxt {
/* Execution mode, passed to the emulator. */
#define X86EMUL_MODE_REAL 0 /* Real mode. */
+#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4f865e8b854..06d9e79ca37 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,7 +25,7 @@
#include <asm/mtrr.h>
#include <asm/msr-index.h>
-#define KVM_MAX_VCPUS 16
+#define KVM_MAX_VCPUS 64
#define KVM_MEMORY_SLOTS 32
/* memory slots that does not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 4
@@ -38,19 +38,6 @@
#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
0xFFFFFF0000000000ULL)
-#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
-#define KVM_GUEST_CR0_MASK \
- (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
-#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
-#define KVM_VM_CR0_ALWAYS_ON \
- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
-#define KVM_GUEST_CR4_MASK \
- (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
-#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
-#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
-
#define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0)
@@ -256,7 +243,8 @@ struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
void (*free)(struct kvm_vcpu *vcpu);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
+ u32 *error);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -282,13 +270,15 @@ struct kvm_vcpu_arch {
u32 regs_dirty;
unsigned long cr0;
+ unsigned long cr0_guest_owned_bits;
unsigned long cr2;
unsigned long cr3;
unsigned long cr4;
+ unsigned long cr4_guest_owned_bits;
unsigned long cr8;
u32 hflags;
u64 pdptrs[4]; /* pae */
- u64 shadow_efer;
+ u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
int32_t apic_arb_prio;
@@ -374,17 +364,27 @@ struct kvm_vcpu_arch {
/* used for guest single stepping over the given code position */
u16 singlestep_cs;
unsigned long singlestep_rip;
+ /* fields used by HYPER-V emulation */
+ u64 hv_vapic;
};
struct kvm_mem_alias {
gfn_t base_gfn;
unsigned long npages;
gfn_t target_gfn;
+#define KVM_ALIAS_INVALID 1UL
+ unsigned long flags;
};
-struct kvm_arch{
- int naliases;
+#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
+
+struct kvm_mem_aliases {
struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
+ int naliases;
+};
+
+struct kvm_arch {
+ struct kvm_mem_aliases *aliases;
unsigned int n_free_mmu_pages;
unsigned int n_requested_mmu_pages;
@@ -416,6 +416,10 @@ struct kvm_arch{
s64 kvmclock_offset;
struct kvm_xen_hvm_config xen_hvm_config;
+
+ /* fields used by HYPER-V emulation */
+ u64 hv_guest_os_id;
+ u64 hv_hypercall;
};
struct kvm_vm_stat {
@@ -471,6 +475,7 @@ struct kvm_x86_ops {
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
bool (*cpu_has_accelerated_tpr)(void);
+ void (*cpuid_update)(struct kvm_vcpu *vcpu);
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
@@ -492,6 +497,7 @@ struct kvm_x86_ops {
void (*set_segment)(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -501,12 +507,13 @@ struct kvm_x86_ops {
void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
- void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
- int *exception);
+ int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
+ int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ void (*fpu_activate)(struct kvm_vcpu *vcpu);
+ void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
@@ -531,7 +538,8 @@ struct kvm_x86_ops {
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
- bool (*gb_page_enable)(void);
+ int (*get_lpage_level)(void);
+ bool (*rdtscp_supported)(void);
const struct trace_print_flags *exit_reasons_str;
};
@@ -606,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long value);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- int type_bits, int seg);
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
@@ -653,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
@@ -666,6 +677,7 @@ void kvm_disable_tdp(void);
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int complete_pio(struct kvm_vcpu *vcpu);
+bool kvm_check_iopl(struct kvm_vcpu *vcpu);
struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c584076a47f..ffae1420e7d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -2,6 +2,7 @@
#define _ASM_X86_KVM_PARA_H
#include <linux/types.h>
+#include <asm/hyperv.h>
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
* should be used to determine that a VM is running under KVM.
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1fecb7e6113..38638cd2fa4 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -313,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXIT_ERR -1
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
+#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2b4945419a8..fb9a080740e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -53,6 +53,7 @@
*/
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
+#define SECONDARY_EXEC_RDTSCP 0x00000008
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
@@ -251,6 +252,7 @@ enum vmcs_field {
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
#define EXIT_REASON_PAUSE_INSTRUCTION 40
#define EXIT_REASON_MCE_DURING_VMENTRY 41
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
@@ -362,6 +364,7 @@ enum vmcs_field {
#define VMX_EPTP_UC_BIT (1ull << 8)
#define VMX_EPTP_WB_BIT (1ull << 14)
#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
+#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -374,7 +377,7 @@ enum vmcs_field {
#define VMX_EPT_READABLE_MASK 0x1ull
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
-#define VMX_EPT_IGMT_BIT (1ull << 6)
+#define VMX_EPT_IPAT_BIT (1ull << 6)
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 9055e5872ff..1c0c6ab9c60 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -301,7 +301,8 @@ static int __init vsyscall_init(void)
register_sysctl_table(kernel_root_table2);
#endif
on_each_cpu(cpu_vsyscall_init, NULL, 1);
- hotcpu_notifier(cpu_vsyscall_notifier, 0);
+ /* notifier priority > KVM */
+ hotcpu_notifier(cpu_vsyscall_notifier, 30);
return 0;
}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 3c4d0109ad2..970bbd47951 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
select USER_RETURN_NOTIFIER
+ select KVM_MMIO
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7e8faea4651..4dade6ac082 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -32,7 +32,7 @@
#include <linux/module.h>
#include <asm/kvm_emulate.h>
-#include "mmu.h" /* for is_long_mode() */
+#include "x86.h"
/*
* Opcode effective-address decode tables.
@@ -76,6 +76,8 @@
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
#define GroupMask 0xff /* Group number stored in bits 0:7 */
/* Misc flags */
+#define Lock (1<<26) /* lock prefix is allowed for the instruction */
+#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
#define No64 (1<<28)
/* Source 2 operand type */
#define Src2None (0<<29)
@@ -88,39 +90,40 @@
enum {
Group1_80, Group1_81, Group1_82, Group1_83,
Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
+ Group8, Group9,
};
static u32 opcode_table[256] = {
/* 0x00 - 0x07 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
/* 0x08 - 0x0F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
ImplicitOps | Stack | No64, 0,
/* 0x10 - 0x17 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
/* 0x18 - 0x1F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
/* 0x20 - 0x27 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x28 - 0x2F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
0, 0, 0, 0,
/* 0x30 - 0x37 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
0, 0, 0, 0,
/* 0x38 - 0x3F */
@@ -156,7 +159,7 @@ static u32 opcode_table[256] = {
Group | Group1_80, Group | Group1_81,
Group | Group1_82, Group | Group1_83,
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
/* 0x88 - 0x8F */
ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -210,7 +213,7 @@ static u32 opcode_table[256] = {
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xF0 - 0xF7 */
0, 0, 0, 0,
- ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
+ ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
/* 0xF8 - 0xFF */
ImplicitOps, 0, ImplicitOps, ImplicitOps,
ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
@@ -218,16 +221,20 @@ static u32 opcode_table[256] = {
static u32 twobyte_table[256] = {
/* 0x00 - 0x0F */
- 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
- ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
+ 0, Group | GroupDual | Group7, 0, 0,
+ 0, ImplicitOps, ImplicitOps | Priv, 0,
+ ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
+ 0, ImplicitOps | ModRM, 0, 0,
/* 0x10 - 0x1F */
0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
/* 0x20 - 0x2F */
- ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
+ ModRM | ImplicitOps | Priv, ModRM | Priv,
+ ModRM | ImplicitOps | Priv, ModRM | Priv,
+ 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30 - 0x3F */
- ImplicitOps, 0, ImplicitOps, 0,
- ImplicitOps, ImplicitOps, 0, 0,
+ ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
+ ImplicitOps, ImplicitOps | Priv, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x40 - 0x47 */
DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -257,21 +264,23 @@ static u32 twobyte_table[256] = {
DstMem | SrcReg | Src2CL | ModRM, 0, 0,
/* 0xA8 - 0xAF */
ImplicitOps | Stack, ImplicitOps | Stack,
- 0, DstMem | SrcReg | ModRM | BitOp,
+ 0, DstMem | SrcReg | ModRM | BitOp | Lock,
DstMem | SrcReg | Src2ImmByte | ModRM,
DstMem | SrcReg | Src2CL | ModRM,
ModRM, 0,
/* 0xB0 - 0xB7 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
- DstMem | SrcReg | ModRM | BitOp,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ 0, DstMem | SrcReg | ModRM | BitOp | Lock,
0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem16 | ModRM | Mov,
/* 0xB8 - 0xBF */
- 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
+ 0, 0,
+ Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem16 | ModRM | Mov,
/* 0xC0 - 0xCF */
- 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+ 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
+ 0, 0, 0, Group | GroupDual | Group9,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xD0 - 0xDF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -283,25 +292,41 @@ static u32 twobyte_table[256] = {
static u32 group_table[] = {
[Group1_80*8] =
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM,
[Group1_81*8] =
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM,
[Group1_82*8] =
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64,
[Group1_83*8] =
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM,
[Group1A*8] =
DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
[Group3_Byte*8] =
@@ -320,24 +345,39 @@ static u32 group_table[] = {
SrcMem | ModRM | Stack, 0,
SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
[Group7*8] =
- 0, 0, ModRM | SrcMem, ModRM | SrcMem,
+ 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
+ SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
+ [Group8*8] =
+ 0, 0, 0, 0,
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
+ [Group9*8] =
+ 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
};
static u32 group2_table[] = {
[Group7*8] =
- SrcNone | ModRM, 0, 0, SrcNone | ModRM,
+ SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
SrcNone | ModRM | DstMem | Mov, 0,
SrcMem16 | ModRM | Mov, 0,
+ [Group9*8] =
+ 0, 0, 0, 0, 0, 0, 0, 0,
};
/* EFLAGS bit definitions. */
+#define EFLG_ID (1<<21)
+#define EFLG_VIP (1<<20)
+#define EFLG_VIF (1<<19)
+#define EFLG_AC (1<<18)
#define EFLG_VM (1<<17)
#define EFLG_RF (1<<16)
+#define EFLG_IOPL (3<<12)
+#define EFLG_NT (1<<14)
#define EFLG_OF (1<<11)
#define EFLG_DF (1<<10)
#define EFLG_IF (1<<9)
+#define EFLG_TF (1<<8)
#define EFLG_SF (1<<7)
#define EFLG_ZF (1<<6)
#define EFLG_AF (1<<4)
@@ -606,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
if (linear < fc->start || linear >= fc->end) {
size = min(15UL, PAGE_SIZE - offset_in_page(linear));
- rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
+ rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
if (rc)
return rc;
fc->start = linear;
@@ -661,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
op_bytes = 3;
*address = 0;
rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
- ctxt->vcpu);
+ ctxt->vcpu, NULL);
if (rc)
return rc;
rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
- ctxt->vcpu);
+ ctxt->vcpu, NULL);
return rc;
}
@@ -889,6 +929,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
switch (mode) {
case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
case X86EMUL_MODE_PROT16:
def_op_bytes = def_ad_bytes = 2;
break;
@@ -975,7 +1016,7 @@ done_prefixes:
}
if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");;
+ kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
return -1;
}
@@ -1196,13 +1237,56 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
rc = ops->read_emulated(register_address(c, ss_base(ctxt),
c->regs[VCPU_REGS_RSP]),
dest, len, ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
return rc;
}
+static int emulate_popf(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ void *dest, int len)
+{
+ int rc;
+ unsigned long val, change_mask;
+ int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
+
+ rc = emulate_pop(ctxt, ops, &val, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
+ | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
+
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_PROT64:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT16:
+ if (cpl == 0)
+ change_mask |= EFLG_IOPL;
+ if (cpl <= iopl)
+ change_mask |= EFLG_IF;
+ break;
+ case X86EMUL_MODE_VM86:
+ if (iopl < 3) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ change_mask |= EFLG_IF;
+ break;
+ default: /* real mode */
+ change_mask |= (EFLG_IOPL | EFLG_IF);
+ break;
+ }
+
+ *(unsigned long *)dest =
+ (ctxt->eflags & ~change_mask) | (val & change_mask);
+
+ return rc;
+}
+
static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
{
struct decode_cache *c = &ctxt->decode;
@@ -1225,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
if (rc != 0)
return rc;
- rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg);
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
return rc;
}
@@ -1370,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
int rc;
rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
@@ -1385,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
(u32) c->regs[VCPU_REGS_RBX];
rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
ctxt->eflags |= EFLG_ZF;
}
@@ -1407,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
if (rc)
return rc;
- rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS);
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
return rc;
}
@@ -1451,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
&c->dst.val,
c->dst.bytes,
ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
break;
case OP_NONE:
@@ -1514,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
u64 msr_data;
/* syscall is not available in real mode */
- if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL
- || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE))
- return -1;
+ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
+ return X86EMUL_UNHANDLEABLE;
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1553,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
}
- return 0;
+ return X86EMUL_CONTINUE;
}
static int
@@ -1563,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
struct kvm_segment cs, ss;
u64 msr_data;
- /* inject #UD if LOCK prefix is used */
- if (c->lock_prefix)
- return -1;
-
- /* inject #GP if in real mode or paging is disabled */
- if (ctxt->mode == X86EMUL_MODE_REAL ||
- !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
+ /* inject #GP if in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_UNHANDLEABLE;
}
/* XXX sysenter/sysexit have not been tested in 64bit mode.
* Therefore, we inject an #UD.
*/
if (ctxt->mode == X86EMUL_MODE_PROT64)
- return -1;
+ return X86EMUL_UNHANDLEABLE;
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1587,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
case X86EMUL_MODE_PROT32:
if ((msr_data & 0xfffc) == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
break;
case X86EMUL_MODE_PROT64:
if (msr_data == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
break;
}
@@ -1618,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
c->regs[VCPU_REGS_RSP] = msr_data;
- return 0;
+ return X86EMUL_CONTINUE;
}
static int
@@ -1629,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
u64 msr_data;
int usermode;
- /* inject #UD if LOCK prefix is used */
- if (c->lock_prefix)
- return -1;
-
- /* inject #GP if in real mode or paging is disabled */
- if (ctxt->mode == X86EMUL_MODE_REAL
- || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
- }
-
- /* sysexit must be called from CPL 0 */
- if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
+ /* inject #GP if in real mode or Virtual 8086 mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_UNHANDLEABLE;
}
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1661,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
cs.selector = (u16)(msr_data + 16);
if ((msr_data & 0xfffc) == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
ss.selector = (u16)(msr_data + 24);
break;
@@ -1669,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
cs.selector = (u16)(msr_data + 32);
if (msr_data == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
ss.selector = cs.selector + 8;
cs.db = 0;
@@ -1685,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
- return 0;
+ return X86EMUL_CONTINUE;
+}
+
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+{
+ int iopl;
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return false;
+ if (ctxt->mode == X86EMUL_MODE_VM86)
+ return true;
+ iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
+}
+
+static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 port, u16 len)
+{
+ struct kvm_segment tr_seg;
+ int r;
+ u16 io_bitmap_ptr;
+ u8 perm, bit_idx = port & 0x7;
+ unsigned mask = (1 << len) - 1;
+
+ kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
+ if (tr_seg.unusable)
+ return false;
+ if (tr_seg.limit < 103)
+ return false;
+ r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
+ NULL);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if (io_bitmap_ptr + port/8 > tr_seg.limit)
+ return false;
+ r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
+ ctxt->vcpu, NULL);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if ((perm >> bit_idx) & mask)
+ return false;
+ return true;
+}
+
+static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 port, u16 len)
+{
+ if (emulator_bad_iopl(ctxt))
+ if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
+ return false;
+ return true;
}
int
@@ -1709,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
saved_eip = c->eip;
+ /* LOCK prefix is allowed only with some instructions */
+ if (c->lock_prefix && !(c->d & Lock)) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
+
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+
if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
memop = c->modrm_ea;
@@ -1749,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
&c->src.val,
c->src.bytes,
ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
c->src.orig_val = c->src.val;
}
@@ -1768,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
c->dst.ptr = (void *)c->dst.ptr +
(c->src.val & mask) / 8;
}
- if (!(c->d & Mov) &&
- /* optimisation - avoid slow emulated read */
- ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes, ctxt->vcpu)) != 0))
- goto done;
+ if (!(c->d & Mov)) {
+ /* optimisation - avoid slow emulated read */
+ rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
}
c->dst.orig_val = c->dst.val;
@@ -1876,7 +2010,12 @@ special_insn:
break;
case 0x6c: /* insb */
case 0x6d: /* insw/insd */
- if (kvm_emulate_pio_string(ctxt->vcpu,
+ if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+ if (kvm_emulate_pio_string(ctxt->vcpu,
1,
(c->d & ByteOp) ? 1 : c->op_bytes,
c->rep_prefix ?
@@ -1892,6 +2031,11 @@ special_insn:
return 0;
case 0x6e: /* outsb */
case 0x6f: /* outsw/outsd */
+ if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
if (kvm_emulate_pio_string(ctxt->vcpu,
0,
(c->d & ByteOp) ? 1 : c->op_bytes,
@@ -1978,25 +2122,19 @@ special_insn:
break;
case 0x8e: { /* mov seg, r/m16 */
uint16_t sel;
- int type_bits;
- int err;
sel = c->src.val;
- if (c->modrm_reg == VCPU_SREG_SS)
- toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
- if (c->modrm_reg <= 5) {
- type_bits = (c->modrm_reg == 1) ? 9 : 1;
- err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
- type_bits, c->modrm_reg);
- } else {
- printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
- c->modrm);
- goto cannot_emulate;
+ if (c->modrm_reg == VCPU_SREG_CS ||
+ c->modrm_reg > VCPU_SREG_GS) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
}
- if (err < 0)
- goto cannot_emulate;
+ if (c->modrm_reg == VCPU_SREG_SS)
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
@@ -2025,7 +2163,10 @@ special_insn:
c->dst.type = OP_REG;
c->dst.ptr = (unsigned long *) &ctxt->eflags;
c->dst.bytes = c->op_bytes;
- goto pop_instruction;
+ rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ break;
case 0xa0 ... 0xa1: /* mov */
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
c->dst.val = c->src.val;
@@ -2039,11 +2180,12 @@ special_insn:
c->dst.ptr = (unsigned long *)register_address(c,
es_base(ctxt),
c->regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
+ rc = ops->read_emulated(register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
&c->dst.val,
- c->dst.bytes, ctxt->vcpu)) != 0)
+ c->dst.bytes, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2058,10 +2200,11 @@ special_insn:
c->src.ptr = (unsigned long *)register_address(c,
seg_override_base(ctxt, c),
c->regs[VCPU_REGS_RSI]);
- if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
- &c->src.val,
- c->src.bytes,
- ctxt->vcpu)) != 0)
+ rc = ops->read_emulated((unsigned long)c->src.ptr,
+ &c->src.val,
+ c->src.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
c->dst.type = OP_NONE; /* Disable writeback. */
@@ -2069,10 +2212,11 @@ special_insn:
c->dst.ptr = (unsigned long *)register_address(c,
es_base(ctxt),
c->regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu)) != 0)
+ rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
@@ -2102,12 +2246,13 @@ special_insn:
c->dst.type = OP_REG;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- if ((rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu)) != 0)
+ rc = ops->read_emulated(register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2163,11 +2308,9 @@ special_insn:
case 0xe9: /* jmp rel */
goto jmp;
case 0xea: /* jmp far */
- if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
- VCPU_SREG_CS) < 0) {
- DPRINTF("jmp far: Failed to load CS descriptor\n");
- goto cannot_emulate;
- }
+ if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
+ VCPU_SREG_CS))
+ goto done;
c->eip = c->src.val;
break;
@@ -2185,7 +2328,13 @@ special_insn:
case 0xef: /* out (e/r)ax,dx */
port = c->regs[VCPU_REGS_RDX];
io_dir_in = 0;
- do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
+ do_io:
+ if (!emulator_io_permited(ctxt, ops, port,
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+ if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
(c->d & ByteOp) ? 1 : c->op_bytes,
port) != 0) {
c->eip = saved_eip;
@@ -2210,13 +2359,21 @@ special_insn:
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfa: /* cli */
- ctxt->eflags &= ~X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ if (emulator_bad_iopl(ctxt))
+ kvm_inject_gp(ctxt->vcpu, 0);
+ else {
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
break;
case 0xfb: /* sti */
- toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
- ctxt->eflags |= X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ if (emulator_bad_iopl(ctxt))
+ kvm_inject_gp(ctxt->vcpu, 0);
+ else {
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
+ ctxt->eflags |= X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
break;
case 0xfc: /* cld */
ctxt->eflags &= ~EFLG_DF;
@@ -2319,8 +2476,9 @@ twobyte_insn:
}
break;
case 0x05: /* syscall */
- if (emulate_syscall(ctxt) == -1)
- goto cannot_emulate;
+ rc = emulate_syscall(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
else
goto writeback;
break;
@@ -2391,14 +2549,16 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 0x34: /* sysenter */
- if (emulate_sysenter(ctxt) == -1)
- goto cannot_emulate;
+ rc = emulate_sysenter(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
else
goto writeback;
break;
case 0x35: /* sysexit */
- if (emulate_sysexit(ctxt) == -1)
- goto cannot_emulate;
+ rc = emulate_sysexit(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
else
goto writeback;
break;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 15578f180e5..294698b6daf 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -242,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
- spin_lock(&ps->inject_lock);
+ raw_spin_lock(&ps->inject_lock);
if (atomic_dec_return(&ps->pit_timer.pending) < 0)
atomic_inc(&ps->pit_timer.pending);
ps->irq_ack = 1;
- spin_unlock(&ps->inject_lock);
+ raw_spin_unlock(&ps->inject_lock);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -605,7 +605,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
.write = speaker_ioport_write,
};
-/* Caller must have writers lock on slots_lock */
+/* Caller must hold slots_lock */
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
@@ -624,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
- spin_lock_init(&pit->pit_state.inject_lock);
+ raw_spin_lock_init(&pit->pit_state.inject_lock);
kvm->arch.vpit = pit;
pit->kvm = kvm;
@@ -645,13 +645,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
kvm_iodevice_init(&pit->dev, &pit_dev_ops);
- ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
if (ret < 0)
goto fail;
if (flags & KVM_PIT_SPEAKER_DUMMY) {
kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
- ret = __kvm_io_bus_register_dev(&kvm->pio_bus,
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
&pit->speaker_dev);
if (ret < 0)
goto fail_unregister;
@@ -660,11 +660,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
return pit;
fail_unregister:
- __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
fail:
- if (pit->irq_source_id >= 0)
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
+ kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
return NULL;
@@ -723,12 +724,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
/* Try to inject pending interrupts when
* last one has been acked.
*/
- spin_lock(&ps->inject_lock);
+ raw_spin_lock(&ps->inject_lock);
if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
ps->irq_ack = 0;
inject = 1;
}
- spin_unlock(&ps->inject_lock);
+ raw_spin_unlock(&ps->inject_lock);
if (inject)
__inject_pit_timer_intr(kvm);
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index d4c1c7ffdc0..900d6b0ba7c 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
- spinlock_t inject_lock;
+ raw_spinlock_t inject_lock;
unsigned long irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index d057c0cbd24..07771da85de 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -44,18 +44,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
* Other interrupt may be delivered to PIC while lock is dropped but
* it should be safe since PIC state is already updated at this stage.
*/
- spin_unlock(&s->pics_state->lock);
+ raw_spin_unlock(&s->pics_state->lock);
kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
- spin_lock(&s->pics_state->lock);
+ raw_spin_lock(&s->pics_state->lock);
}
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
- spin_lock(&s->lock);
+
+ raw_spin_lock(&s->lock);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
}
/*
@@ -156,9 +157,9 @@ static void pic_update_irq(struct kvm_pic *s)
void kvm_pic_update_irq(struct kvm_pic *s)
{
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
pic_update_irq(s);
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
}
int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -166,14 +167,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
struct kvm_pic *s = opaque;
int ret = -1;
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
if (irq >= 0 && irq < PIC_NUM_PINS) {
ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
pic_update_irq(s);
trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
s->pics[irq >> 3].imr, ret == 0);
}
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return ret;
}
@@ -203,7 +204,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
pic_intack(&s->pics[0], irq);
@@ -228,7 +229,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return intno;
}
@@ -442,7 +443,7 @@ static int picdev_write(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte write\n");
return 0;
}
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -455,7 +456,7 @@ static int picdev_write(struct kvm_io_device *this,
elcr_ioport_write(&s->pics[addr & 1], addr, data);
break;
}
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return 0;
}
@@ -472,7 +473,7 @@ static int picdev_read(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte read\n");
return 0;
}
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -486,7 +487,7 @@ static int picdev_read(struct kvm_io_device *this,
break;
}
*(unsigned char *)val = data;
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return 0;
}
@@ -520,7 +521,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
if (!s)
return NULL;
- spin_lock_init(&s->lock);
+ raw_spin_lock_init(&s->lock);
s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
s->pics[1].elcr_mask = 0xde;
@@ -533,7 +534,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
* Initialize PIO device
*/
kvm_iodevice_init(&s->dev, &picdev_ops);
- ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
+ mutex_unlock(&kvm->slots_lock);
if (ret < 0) {
kfree(s);
return NULL;
@@ -541,3 +544,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
return s;
}
+
+void kvm_destroy_pic(struct kvm *kvm)
+{
+ struct kvm_pic *vpic = kvm->arch.vpic;
+
+ if (vpic) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
+ kvm->arch.vpic = NULL;
+ kfree(vpic);
+ }
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index be399e207d5..34b15915754 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -62,7 +62,7 @@ struct kvm_kpic_state {
};
struct kvm_pic {
- spinlock_t lock;
+ raw_spinlock_t lock;
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
@@ -75,6 +75,7 @@ struct kvm_pic {
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_destroy_pic(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
void kvm_pic_clear_isr_ack(struct kvm *kvm);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 7bcc5b6a440..cff851cf532 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -1,6 +1,11 @@
#ifndef ASM_KVM_CACHE_REGS_H
#define ASM_KVM_CACHE_REGS_H
+#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
+#define KVM_POSSIBLE_CR4_GUEST_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
+
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
@@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
return vcpu->arch.pdptrs[index];
}
+static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
+ if (tmask & vcpu->arch.cr0_guest_owned_bits)
+ kvm_x86_ops->decache_cr0_guest_bits(vcpu);
+ return vcpu->arch.cr0 & mask;
+}
+
+static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, ~0UL);
+}
+
+static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
+ if (tmask & vcpu->arch.cr4_guest_owned_bits)
+ kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+ return vcpu->arch.cr4 & mask;
+}
+
+static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, ~0UL);
+}
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ba8c045da78..4b224f90087 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1246,3 +1246,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
return 0;
}
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return 1;
+
+ /* if this is ICR write vector before command */
+ if (reg == APIC_ICR)
+ apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return apic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 low, high = 0;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return 1;
+
+ if (apic_reg_read(apic, reg, 4, &low))
+ return 1;
+ if (reg == APIC_ICR)
+ apic_reg_read(apic, APIC_ICR2, 4, &high);
+
+ *data = (((u64)high) << 32) | low;
+
+ return 0;
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 40010b09c4a..f5fe32c5eda 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
+}
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 89a49fb46a2..741373e8ca7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
*/
#include "mmu.h"
+#include "x86.h"
#include "kvm_cache_regs.h"
#include <linux/kvm_host.h>
@@ -29,6 +30,7 @@
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
+#include <linux/srcu.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -136,16 +138,6 @@ module_param(oos_shadow, bool, 0644);
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
| PT64_NX_MASK)
-#define PFERR_PRESENT_MASK (1U << 0)
-#define PFERR_WRITE_MASK (1U << 1)
-#define PFERR_USER_MASK (1U << 2)
-#define PFERR_RSVD_MASK (1U << 3)
-#define PFERR_FETCH_MASK (1U << 4)
-
-#define PT_PDPE_LEVEL 3
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-
#define RMAP_EXT 4
#define ACC_EXEC_MASK 1
@@ -153,6 +145,9 @@ module_param(oos_shadow, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+#include <trace/events/kvm.h>
+
+#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
@@ -229,7 +224,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
static int is_write_protection(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.cr0 & X86_CR0_WP;
+ return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}
static int is_cpuid_PSE36(void)
@@ -239,7 +234,7 @@ static int is_cpuid_PSE36(void)
static int is_nx(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.shadow_efer & EFER_NX;
+ return vcpu->arch.efer & EFER_NX;
}
static int is_shadow_present_pte(u64 pte)
@@ -253,7 +248,7 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}
-static int is_writeble_pte(unsigned long pte)
+static int is_writable_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
}
@@ -470,24 +465,10 @@ static int has_wrprotected_page(struct kvm *kvm,
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
- unsigned long page_size = PAGE_SIZE;
- struct vm_area_struct *vma;
- unsigned long addr;
+ unsigned long page_size;
int i, ret = 0;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr))
- return PT_PAGE_TABLE_LEVEL;
-
- down_read(&current->mm->mmap_sem);
- vma = find_vma(current->mm, addr);
- if (!vma)
- goto out;
-
- page_size = vma_kernel_pagesize(vma);
-
-out:
- up_read(&current->mm->mmap_sem);
+ page_size = kvm_host_page_size(kvm, gfn);
for (i = PT_PAGE_TABLE_LEVEL;
i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
@@ -503,8 +484,7 @@ out:
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
struct kvm_memory_slot *slot;
- int host_level;
- int level = PT_PAGE_TABLE_LEVEL;
+ int host_level, level, max_level;
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
if (slot && slot->dirty_bitmap)
@@ -515,7 +495,10 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
if (host_level == PT_PAGE_TABLE_LEVEL)
return host_level;
- for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level)
+ max_level = kvm_x86_ops->get_lpage_level() < host_level ?
+ kvm_x86_ops->get_lpage_level() : host_level;
+
+ for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
break;
@@ -633,7 +616,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
pfn = spte_to_pfn(*spte);
if (*spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
- if (is_writeble_pte(*spte))
+ if (is_writable_pte(*spte))
kvm_set_pfn_dirty(pfn);
rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
if (!*rmapp) {
@@ -662,6 +645,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
prev_desc = desc;
desc = desc->more;
}
+ pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
BUG();
}
}
@@ -708,7 +692,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!spte);
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- if (is_writeble_pte(*spte)) {
+ if (is_writable_pte(*spte)) {
__set_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
@@ -732,7 +716,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
- if (is_writeble_pte(*spte)) {
+ if (is_writable_pte(*spte)) {
rmap_remove(kvm, spte);
--kvm->stat.lpages;
__set_spte(spte, shadow_trap_nonpresent_pte);
@@ -787,7 +771,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
- if (is_writeble_pte(*spte))
+ if (is_writable_pte(*spte))
kvm_set_pfn_dirty(spte_to_pfn(*spte));
__set_spte(spte, new_spte);
spte = rmap_next(kvm, rmapp, spte);
@@ -805,35 +789,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
unsigned long data))
{
int i, j;
+ int ret;
int retval = 0;
+ struct kvm_memslots *slots;
- /*
- * If mmap_sem isn't taken, we can look the memslots with only
- * the mmu_lock by skipping over the slots with userspace_addr == 0.
- */
- for (i = 0; i < kvm->nmemslots; i++) {
- struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ slots = rcu_dereference(kvm->memslots);
+
+ for (i = 0; i < slots->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
unsigned long start = memslot->userspace_addr;
unsigned long end;
- /* mmu_lock protects userspace_addr */
- if (!start)
- continue;
-
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- retval |= handler(kvm, &memslot->rmap[gfn_offset],
- data);
+ ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
int idx = gfn_offset;
idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
- retval |= handler(kvm,
+ ret |= handler(kvm,
&memslot->lpage_info[j][idx].rmap_pde,
data);
}
+ trace_kvm_age_page(hva, memslot, ret);
+ retval |= ret;
}
}
@@ -856,9 +837,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
u64 *spte;
int young = 0;
- /* always return old for EPT */
+ /*
+ * Emulate the accessed bit for EPT, by checking if this page has
+ * an EPT mapping, and clearing it if it does. On the next access,
+ * a new EPT mapping will be established.
+ * This has some overhead, but not as much as the cost of swapping
+ * out actively used pages or breaking up actively used hugepages.
+ */
if (!shadow_accessed_mask)
- return 0;
+ return kvm_unmap_rmapp(kvm, rmapp, data);
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
@@ -1615,7 +1602,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
{
- int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
+ int slot = memslot_id(kvm, gfn);
struct kvm_mmu_page *sp = page_header(__pa(pte));
__set_bit(slot, sp->slot_bitmap);
@@ -1639,7 +1626,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
struct page *page;
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+ gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
if (gpa == UNMAPPED_GVA)
return NULL;
@@ -1852,7 +1839,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* is responsibility of mmu_get_page / kvm_sync_page.
* Same reasoning can be applied to dirty page accounting.
*/
- if (!can_unsync && is_writeble_pte(*sptep))
+ if (!can_unsync && is_writable_pte(*sptep))
goto set_pte;
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1860,7 +1847,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
__func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
- if (is_writeble_pte(spte))
+ if (is_writable_pte(spte))
spte &= ~PT_WRITABLE_MASK;
}
}
@@ -1881,7 +1868,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
bool reset_host_protection)
{
int was_rmapped = 0;
- int was_writeble = is_writeble_pte(*sptep);
+ int was_writable = is_writable_pte(*sptep);
int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1932,7 +1919,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
rmap_recycle(vcpu, sptep, gfn);
} else {
- if (was_writeble)
+ if (was_writable)
kvm_release_pfn_dirty(pfn);
else
kvm_release_pfn_clean(pfn);
@@ -2162,8 +2149,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->kvm->mmu_lock);
}
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
+ u32 access, u32 *error)
{
+ if (error)
+ *error = 0;
return vaddr;
}
@@ -2747,7 +2737,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
if (tdp_enabled)
return 0;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -2847,16 +2837,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
*/
page = alloc_page(GFP_KERNEL | __GFP_DMA32);
if (!page)
- goto error_1;
+ return -ENOMEM;
+
vcpu->arch.mmu.pae_root = page_address(page);
for (i = 0; i < 4; ++i)
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
return 0;
-
-error_1:
- free_mmu_pages(vcpu);
- return -ENOMEM;
}
int kvm_mmu_create(struct kvm_vcpu *vcpu)
@@ -2936,10 +2923,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
- int npages;
+ int npages, idx;
- if (!down_read_trylock(&kvm->slots_lock))
- continue;
+ idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
npages = kvm->arch.n_alloc_mmu_pages -
kvm->arch.n_free_mmu_pages;
@@ -2952,7 +2938,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
nr_to_scan--;
spin_unlock(&kvm->mmu_lock);
- up_read(&kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
}
if (kvm_freed)
list_move_tail(&kvm_freed->vm_list, &vm_list);
@@ -3019,9 +3005,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
int i;
unsigned int nr_mmu_pages;
unsigned int nr_pages = 0;
+ struct kvm_memslots *slots;
- for (i = 0; i < kvm->nmemslots; i++)
- nr_pages += kvm->memslots[i].npages;
+ slots = rcu_dereference(kvm->memslots);
+ for (i = 0; i < slots->nmemslots; i++)
+ nr_pages += slots->memslots[i].npages;
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
nr_mmu_pages = max(nr_mmu_pages,
@@ -3246,7 +3234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
audit_mappings_page(vcpu, ent, va, level - 1);
else {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
+ gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
gfn_t gfn = gpa >> PAGE_SHIFT;
pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
@@ -3291,10 +3279,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
static int count_rmaps(struct kvm_vcpu *vcpu)
{
int nmaps = 0;
- int i, j, k;
+ int i, j, k, idx;
+ idx = srcu_read_lock(&kvm->srcu);
+ slots = rcu_dereference(kvm->memslots);
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+ struct kvm_memory_slot *m = &slots->memslots[i];
struct kvm_rmap_desc *d;
for (j = 0; j < m->npages; ++j) {
@@ -3317,6 +3307,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
}
}
}
+ srcu_read_unlock(&kvm->srcu, idx);
return nmaps;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 61a1b3884b4..be66759321a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -2,6 +2,7 @@
#define __KVM_X86_MMU_H
#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
#define PT64_PT_BITS 9
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -37,6 +38,16 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
+#define PT_PDPE_LEVEL 3
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
+#define PFERR_FETCH_MASK (1U << 4)
+
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
@@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
return kvm_mmu_load(vcpu);
}
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- return vcpu->arch.shadow_efer & EFER_LMA;
-#else
- return 0;
-#endif
-}
-
-static inline int is_pae(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.cr4 & X86_CR4_PAE;
-}
-
-static inline int is_pse(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.cr4 & X86_CR4_PSE;
-}
-
-static inline int is_paging(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.cr0 & X86_CR0_PG;
-}
-
static inline int is_present_gpte(unsigned long pte)
{
return pte & PT_PRESENT_MASK;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ede2131a922..81eab9a50e6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -162,7 +162,7 @@ walk:
if (rsvd_fault)
goto access_error;
- if (write_fault && !is_writeble_pte(pte))
+ if (write_fault && !is_writable_pte(pte))
if (user_fault || is_write_protection(vcpu))
goto access_error;
@@ -490,18 +490,23 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
spin_unlock(&vcpu->kvm->mmu_lock);
}
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
+ u32 *error)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
+ r = FNAME(walk_addr)(&walker, vcpu, vaddr,
+ !!(access & PFERR_WRITE_MASK),
+ !!(access & PFERR_USER_MASK),
+ !!(access & PFERR_FETCH_MASK));
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
- }
+ } else if (error)
+ *error = walker.error_code;
return gpa;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1d9b33843c8..52f78dd0301 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -231,7 +231,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
- vcpu->arch.shadow_efer = efer;
+ vcpu->arch.efer = efer;
}
static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -540,6 +540,8 @@ static void init_vmcb(struct vcpu_svm *svm)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
+ svm->vcpu.fpu_active = 1;
+
control->intercept_cr_read = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
INTERCEPT_CR4_MASK;
@@ -552,13 +554,19 @@ static void init_vmcb(struct vcpu_svm *svm)
control->intercept_dr_read = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK;
+ INTERCEPT_DR3_MASK |
+ INTERCEPT_DR4_MASK |
+ INTERCEPT_DR5_MASK |
+ INTERCEPT_DR6_MASK |
+ INTERCEPT_DR7_MASK;
control->intercept_dr_write = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
INTERCEPT_DR2_MASK |
INTERCEPT_DR3_MASK |
+ INTERCEPT_DR4_MASK |
INTERCEPT_DR5_MASK |
+ INTERCEPT_DR6_MASK |
INTERCEPT_DR7_MASK;
control->intercept_exceptions = (1 << PF_VECTOR) |
@@ -569,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
control->intercept = (1ULL << INTERCEPT_INTR) |
(1ULL << INTERCEPT_NMI) |
(1ULL << INTERCEPT_SMI) |
+ (1ULL << INTERCEPT_SELECTIVE_CR0) |
(1ULL << INTERCEPT_CPUID) |
(1ULL << INTERCEPT_INVD) |
(1ULL << INTERCEPT_HLT) |
@@ -641,10 +650,8 @@ static void init_vmcb(struct vcpu_svm *svm)
control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
(1ULL << INTERCEPT_INVLPG));
control->intercept_exceptions &= ~(1 << PF_VECTOR);
- control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
- INTERCEPT_CR3_MASK);
- control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
- INTERCEPT_CR3_MASK);
+ control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
+ control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
save->g_pat = 0x0007040600070406ULL;
save->cr3 = 0;
save->cr4 = 0;
@@ -730,7 +737,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
init_vmcb(svm);
fx_init(&svm->vcpu);
- svm->vcpu.fpu_active = 1;
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
@@ -765,14 +771,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (unlikely(cpu != vcpu->cpu)) {
u64 delta;
- /*
- * Make sure that the guest sees a monotonically
- * increasing TSC.
- */
- delta = vcpu->arch.host_tsc - native_read_tsc();
- svm->vmcb->control.tsc_offset += delta;
- if (is_nested(svm))
- svm->nested.hsave->control.tsc_offset += delta;
+ if (check_tsc_unstable()) {
+ /*
+ * Make sure that the guest sees a monotonically
+ * increasing TSC.
+ */
+ delta = vcpu->arch.host_tsc - native_read_tsc();
+ svm->vmcb->control.tsc_offset += delta;
+ if (is_nested(svm))
+ svm->nested.hsave->control.tsc_offset += delta;
+ }
vcpu->cpu = cpu;
kvm_migrate_timers(vcpu);
svm->asid_generation = 0;
@@ -954,42 +962,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
svm->vmcb->save.gdtr.base = dt->base ;
}
+static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
+static void update_cr0_intercept(struct vcpu_svm *svm)
+{
+ ulong gcr0 = svm->vcpu.arch.cr0;
+ u64 *hcr0 = &svm->vmcb->save.cr0;
+
+ if (!svm->vcpu.fpu_active)
+ *hcr0 |= SVM_CR0_SELECTIVE_MASK;
+ else
+ *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
+ | (gcr0 & SVM_CR0_SELECTIVE_MASK);
+
+
+ if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
+ svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+ svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ } else {
+ svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
+ svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+ }
+}
+
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->arch.shadow_efer & EFER_LME) {
+ if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
- vcpu->arch.shadow_efer |= EFER_LMA;
+ vcpu->arch.efer |= EFER_LMA;
svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
}
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
- vcpu->arch.shadow_efer &= ~EFER_LMA;
+ vcpu->arch.efer &= ~EFER_LMA;
svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
}
}
#endif
- if (npt_enabled)
- goto set;
+ vcpu->arch.cr0 = cr0;
- if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
- svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
- vcpu->fpu_active = 1;
- }
+ if (!npt_enabled)
+ cr0 |= X86_CR0_PG | X86_CR0_WP;
- vcpu->arch.cr0 = cr0;
- cr0 |= X86_CR0_PG | X86_CR0_WP;
- if (!vcpu->fpu_active) {
- svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
+ if (!vcpu->fpu_active)
cr0 |= X86_CR0_TS;
- }
-set:
/*
* re-enable caching here because the QEMU bios
* does not do it - this results in some delay at
@@ -997,6 +1022,7 @@ set:
*/
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
+ update_cr0_intercept(svm);
}
static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1102,76 +1128,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->vmcb->control.asid = sd->next_asid++;
}
-static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
+static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
{
struct vcpu_svm *svm = to_svm(vcpu);
- unsigned long val;
switch (dr) {
case 0 ... 3:
- val = vcpu->arch.db[dr];
+ *dest = vcpu->arch.db[dr];
break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 6:
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- val = vcpu->arch.dr6;
+ *dest = vcpu->arch.dr6;
else
- val = svm->vmcb->save.dr6;
+ *dest = svm->vmcb->save.dr6;
break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 7:
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- val = vcpu->arch.dr7;
+ *dest = vcpu->arch.dr7;
else
- val = svm->vmcb->save.dr7;
+ *dest = svm->vmcb->save.dr7;
break;
- default:
- val = 0;
}
- return val;
+ return EMULATE_DONE;
}
-static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
- int *exception)
+static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
{
struct vcpu_svm *svm = to_svm(vcpu);
- *exception = 0;
-
switch (dr) {
case 0 ... 3:
vcpu->arch.db[dr] = value;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = value;
- return;
- case 4 ... 5:
- if (vcpu->arch.cr4 & X86_CR4_DE)
- *exception = UD_VECTOR;
- return;
+ break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 6:
- if (value & 0xffffffff00000000ULL) {
- *exception = GP_VECTOR;
- return;
- }
vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
- return;
+ break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 7:
- if (value & 0xffffffff00000000ULL) {
- *exception = GP_VECTOR;
- return;
- }
vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
svm->vmcb->save.dr7 = vcpu->arch.dr7;
vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
}
- return;
- default:
- /* FIXME: Possible case? */
- printk(KERN_DEBUG "%s: unexpected dr %u\n",
- __func__, dr);
- *exception = UD_VECTOR;
- return;
+ break;
}
+
+ return EMULATE_DONE;
}
static int pf_interception(struct vcpu_svm *svm)
@@ -1239,13 +1259,17 @@ static int ud_interception(struct vcpu_svm *svm)
return 1;
}
-static int nm_interception(struct vcpu_svm *svm)
+static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
- if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
- svm->vmcb->save.cr0 &= ~X86_CR0_TS;
svm->vcpu.fpu_active = 1;
+ update_cr0_intercept(svm);
+}
+static int nm_interception(struct vcpu_svm *svm)
+{
+ svm_fpu_activate(&svm->vcpu);
return 1;
}
@@ -1337,7 +1361,7 @@ static int vmmcall_interception(struct vcpu_svm *svm)
static int nested_svm_check_permissions(struct vcpu_svm *svm)
{
- if (!(svm->vcpu.arch.shadow_efer & EFER_SVME)
+ if (!(svm->vcpu.arch.efer & EFER_SVME)
|| !is_paging(&svm->vcpu)) {
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
@@ -1740,8 +1764,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
hsave->save.ds = vmcb->save.ds;
hsave->save.gdtr = vmcb->save.gdtr;
hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.shadow_efer;
- hsave->save.cr0 = svm->vcpu.arch.cr0;
+ hsave->save.efer = svm->vcpu.arch.efer;
+ hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
hsave->save.cr4 = svm->vcpu.arch.cr4;
hsave->save.rflags = vmcb->save.rflags;
hsave->save.rip = svm->next_rip;
@@ -2153,9 +2177,10 @@ static int rdmsr_interception(struct vcpu_svm *svm)
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data;
- if (svm_get_msr(&svm->vcpu, ecx, &data))
+ if (svm_get_msr(&svm->vcpu, ecx, &data)) {
+ trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(&svm->vcpu, 0);
- else {
+ } else {
trace_kvm_msr_read(ecx, data);
svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
@@ -2247,13 +2272,15 @@ static int wrmsr_interception(struct vcpu_svm *svm)
u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- trace_kvm_msr_write(ecx, data);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (svm_set_msr(&svm->vcpu, ecx, data))
+ if (svm_set_msr(&svm->vcpu, ecx, data)) {
+ trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(&svm->vcpu, 0);
- else
+ } else {
+ trace_kvm_msr_write(ecx, data);
skip_emulated_instruction(&svm->vcpu);
+ }
return 1;
}
@@ -2297,7 +2324,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR3] = emulate_on_interception,
[SVM_EXIT_READ_CR4] = emulate_on_interception,
[SVM_EXIT_READ_CR8] = emulate_on_interception,
- /* for now: */
+ [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
[SVM_EXIT_WRITE_CR0] = emulate_on_interception,
[SVM_EXIT_WRITE_CR3] = emulate_on_interception,
[SVM_EXIT_WRITE_CR4] = emulate_on_interception,
@@ -2306,11 +2333,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_DR1] = emulate_on_interception,
[SVM_EXIT_READ_DR2] = emulate_on_interception,
[SVM_EXIT_READ_DR3] = emulate_on_interception,
+ [SVM_EXIT_READ_DR4] = emulate_on_interception,
+ [SVM_EXIT_READ_DR5] = emulate_on_interception,
+ [SVM_EXIT_READ_DR6] = emulate_on_interception,
+ [SVM_EXIT_READ_DR7] = emulate_on_interception,
[SVM_EXIT_WRITE_DR0] = emulate_on_interception,
[SVM_EXIT_WRITE_DR1] = emulate_on_interception,
[SVM_EXIT_WRITE_DR2] = emulate_on_interception,
[SVM_EXIT_WRITE_DR3] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
[SVM_EXIT_WRITE_DR5] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
[SVM_EXIT_WRITE_DR7] = emulate_on_interception,
[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
@@ -2383,20 +2416,10 @@ static int handle_exit(struct kvm_vcpu *vcpu)
svm_complete_interrupts(svm);
- if (npt_enabled) {
- int mmu_reload = 0;
- if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
- svm_set_cr0(vcpu, svm->vmcb->save.cr0);
- mmu_reload = 1;
- }
+ if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
vcpu->arch.cr3 = svm->vmcb->save.cr3;
- if (mmu_reload) {
- kvm_mmu_reset_context(vcpu);
- kvm_mmu_load(vcpu);
- }
- }
-
if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2798,12 +2821,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
svm->vmcb->save.cr3 = root;
force_new_asid(vcpu);
-
- if (vcpu->fpu_active) {
- svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
- svm->vmcb->save.cr0 |= X86_CR0_TS;
- vcpu->fpu_active = 0;
- }
}
static int is_disabled(void)
@@ -2852,6 +2869,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return 0;
}
+static void svm_cpuid_update(struct kvm_vcpu *vcpu)
+{
+}
+
static const struct trace_print_flags svm_exit_reasons_str[] = {
{ SVM_EXIT_READ_CR0, "read_cr0" },
{ SVM_EXIT_READ_CR3, "read_cr3" },
@@ -2905,9 +2926,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
{ -1, NULL }
};
-static bool svm_gb_page_enable(void)
+static int svm_get_lpage_level(void)
{
- return true;
+ return PT_PDPE_LEVEL;
+}
+
+static bool svm_rdtscp_supported(void)
+{
+ return false;
+}
+
+static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ update_cr0_intercept(svm);
+ svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
}
static struct kvm_x86_ops svm_x86_ops = {
@@ -2936,6 +2970,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_segment = svm_set_segment,
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
.set_cr0 = svm_set_cr0,
.set_cr3 = svm_set_cr3,
@@ -2950,6 +2985,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
+ .fpu_activate = svm_fpu_activate,
+ .fpu_deactivate = svm_fpu_deactivate,
.tlb_flush = svm_flush_tlb,
@@ -2975,7 +3012,11 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_mt_mask = svm_get_mt_mask,
.exit_reasons_str = svm_exit_reasons_str,
- .gb_page_enable = svm_gb_page_enable,
+ .get_lpage_level = svm_get_lpage_level,
+
+ .cpuid_update = svm_cpuid_update,
+
+ .rdtscp_supported = svm_rdtscp_supported,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 816e0449db0..6ad30a29f04 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall,
);
/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hv_hypercall,
+ TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
+ __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+
+ TP_STRUCT__entry(
+ __field( __u16, code )
+ __field( bool, fast )
+ __field( __u16, rep_cnt )
+ __field( __u16, rep_idx )
+ __field( __u64, ingpa )
+ __field( __u64, outgpa )
+ ),
+
+ TP_fast_assign(
+ __entry->code = code;
+ __entry->fast = fast;
+ __entry->rep_cnt = rep_cnt;
+ __entry->rep_idx = rep_idx;
+ __entry->ingpa = ingpa;
+ __entry->outgpa = outgpa;
+ ),
+
+ TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ __entry->code, __entry->fast ? "fast" : "slow",
+ __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
+ __entry->outgpa)
+);
+
+/*
* Tracepoint for PIO.
*/
TRACE_EVENT(kvm_pio,
@@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault,
* Tracepoint for guest MSR access.
*/
TRACE_EVENT(kvm_msr,
- TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data),
- TP_ARGS(rw, ecx, data),
+ TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
+ TP_ARGS(write, ecx, data, exception),
TP_STRUCT__entry(
- __field( unsigned int, rw )
- __field( unsigned int, ecx )
- __field( unsigned long, data )
+ __field( unsigned, write )
+ __field( u32, ecx )
+ __field( u64, data )
+ __field( u8, exception )
),
TP_fast_assign(
- __entry->rw = rw;
+ __entry->write = write;
__entry->ecx = ecx;
__entry->data = data;
+ __entry->exception = exception;
),
- TP_printk("msr_%s %x = 0x%lx",
- __entry->rw ? "write" : "read",
- __entry->ecx, __entry->data)
+ TP_printk("msr_%s %x = 0x%llx%s",
+ __entry->write ? "write" : "read",
+ __entry->ecx, __entry->data,
+ __entry->exception ? " (#GP)" : "")
);
-#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data)
-#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data)
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
+#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
+#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
/*
* Tracepoint for guest CR access.
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4918d6fc92..14873b9f843 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,6 +61,21 @@ module_param_named(unrestricted_guest,
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK \
+ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE)
+#define KVM_VM_CR0_ALWAYS_ON \
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_CR4_GUEST_OWNED_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT)
+
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
@@ -136,6 +151,8 @@ struct vcpu_vmx {
ktime_t entry_time;
s64 vnmi_blocked_time;
u32 exit_reason;
+
+ bool rdtscp_enabled;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -210,7 +227,7 @@ static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
- MSR_EFER, MSR_K6_STAR,
+ MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
@@ -301,6 +318,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
}
+static inline bool cpu_has_vmx_ept_1g_page(void)
+{
+ return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
+}
+
static inline int cpu_has_vmx_invept_individual_addr(void)
{
return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -336,9 +358,7 @@ static inline int cpu_has_vmx_ple(void)
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
- return flexpriority_enabled &&
- (cpu_has_vmx_virtualize_apic_accesses()) &&
- (irqchip_in_kernel(kvm));
+ return flexpriority_enabled && irqchip_in_kernel(kvm);
}
static inline int cpu_has_vmx_vpid(void)
@@ -347,6 +367,12 @@ static inline int cpu_has_vmx_vpid(void)
SECONDARY_EXEC_ENABLE_VPID;
}
+static inline int cpu_has_vmx_rdtscp(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDTSCP;
+}
+
static inline int cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -551,22 +577,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
- eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
- if (!vcpu->fpu_active)
- eb |= 1u << NM_VECTOR;
- /*
- * Unconditionally intercept #DB so we can maintain dr6 without
- * reading it every exit.
- */
- eb |= 1u << DB_VECTOR;
- if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- eb |= 1u << BP_VECTOR;
- }
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
+ (1u << NM_VECTOR) | (1u << DB_VECTOR);
+ if ((vcpu->guest_debug &
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
+ eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+ if (vcpu->fpu_active)
+ eb &= ~(1u << NM_VECTOR);
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -589,7 +611,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
u64 guest_efer;
u64 ignore_bits;
- guest_efer = vmx->vcpu.arch.shadow_efer;
+ guest_efer = vmx->vcpu.arch.efer;
/*
* NX is emulated; LMA and LME handled by hardware; SCE meaninless
@@ -767,22 +789,30 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
{
+ ulong cr0;
+
if (vcpu->fpu_active)
return;
vcpu->fpu_active = 1;
- vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
- if (vcpu->arch.cr0 & X86_CR0_TS)
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+ cr0 = vmcs_readl(GUEST_CR0);
+ cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
+ cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
+ vmcs_writel(GUEST_CR0, cr0);
update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
}
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
+
static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
{
- if (!vcpu->fpu_active)
- return;
- vcpu->fpu_active = 0;
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+ vmx_decache_cr0_guest_bits(vcpu);
+ vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits = 0;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+ vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
}
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -878,6 +908,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
}
+static bool vmx_rdtscp_supported(void)
+{
+ return cpu_has_vmx_rdtscp();
+}
+
/*
* Swap MSR entry in host/guest MSR entry array.
*/
@@ -913,12 +948,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
index = __find_msr_index(vmx, MSR_CSTAR);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_TSC_AUX);
+ if (index >= 0 && vmx->rdtscp_enabled)
+ move_msr_up(vmx, index, save_nmsrs++);
/*
* MSR_K6_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
*/
index = __find_msr_index(vmx, MSR_K6_STAR);
- if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
+ if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
move_msr_up(vmx, index, save_nmsrs++);
}
#endif
@@ -1002,6 +1040,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
case MSR_IA32_SYSENTER_ESP:
data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
+ case MSR_TSC_AUX:
+ if (!to_vmx(vcpu)->rdtscp_enabled)
+ return 1;
+ /* Otherwise falls through */
default:
vmx_load_host_state(to_vmx(vcpu));
msr = find_msr_entry(to_vmx(vcpu), msr_index);
@@ -1065,7 +1107,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vcpu->arch.pat = data;
break;
}
- /* Otherwise falls through to kvm_set_msr_common */
+ ret = kvm_set_msr_common(vcpu, msr_index, data);
+ break;
+ case MSR_TSC_AUX:
+ if (!vmx->rdtscp_enabled)
+ return 1;
+ /* Check reserved bit, higher 32 bits should be zero */
+ if ((data >> 32) != 0)
+ return 1;
+ /* Otherwise falls through */
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -1224,6 +1274,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING;
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
@@ -1243,7 +1295,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
- SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING |
+ SECONDARY_EXEC_RDTSCP;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -1457,8 +1510,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
static gva_t rmode_tss_base(struct kvm *kvm)
{
if (!kvm->arch.tss_addr) {
- gfn_t base_gfn = kvm->memslots[0].base_gfn +
- kvm->memslots[0].npages - 3;
+ struct kvm_memslots *slots;
+ gfn_t base_gfn;
+
+ slots = rcu_dereference(kvm->memslots);
+ base_gfn = kvm->memslots->memslots[0].base_gfn +
+ kvm->memslots->memslots[0].npages - 3;
return base_gfn << PAGE_SHIFT;
}
return kvm->arch.tss_addr;
@@ -1544,9 +1601,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
* of this msr depends on is_long_mode().
*/
vmx_load_host_state(to_vmx(vcpu));
- vcpu->arch.shadow_efer = efer;
- if (!msr)
- return;
+ vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1576,13 +1631,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
(guest_tr_ar & ~AR_TYPE_MASK)
| AR_TYPE_BUSY_64_TSS);
}
- vcpu->arch.shadow_efer |= EFER_LMA;
- vmx_set_efer(vcpu, vcpu->arch.shadow_efer);
+ vcpu->arch.efer |= EFER_LMA;
+ vmx_set_efer(vcpu, vcpu->arch.efer);
}
static void exit_lmode(struct kvm_vcpu *vcpu)
{
- vcpu->arch.shadow_efer &= ~EFER_LMA;
+ vcpu->arch.efer &= ~EFER_LMA;
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1598,10 +1653,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
}
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+ ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+ vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
+ vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
+}
+
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
- vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
- vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
+ ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+ vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
}
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -1646,7 +1711,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, vcpu->arch.cr4);
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
} else if (!is_paging(vcpu)) {
/* From nonpaging to paging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1654,23 +1719,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, vcpu->arch.cr4);
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
}
if (!(cr0 & X86_CR0_WP))
*hw_cr0 &= ~X86_CR0_WP;
}
-static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
- struct kvm_vcpu *vcpu)
-{
- if (!is_paging(vcpu)) {
- *hw_cr4 &= ~X86_CR4_PAE;
- *hw_cr4 |= X86_CR4_PSE;
- } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
- *hw_cr4 &= ~X86_CR4_PAE;
-}
-
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1682,8 +1737,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
else
hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
- vmx_fpu_deactivate(vcpu);
-
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
@@ -1691,7 +1744,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
enter_rmode(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->arch.shadow_efer & EFER_LME) {
+ if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1702,12 +1755,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (enable_ept)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+ if (!vcpu->fpu_active)
+ hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
+
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
-
- if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
- vmx_fpu_activate(vcpu);
}
static u64 construct_eptp(unsigned long root_hpa)
@@ -1738,8 +1791,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
vmx_flush_tlb(vcpu);
vmcs_writel(GUEST_CR3, guest_cr3);
- if (vcpu->arch.cr0 & X86_CR0_PE)
- vmx_fpu_deactivate(vcpu);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1748,8 +1799,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
vcpu->arch.cr4 = cr4;
- if (enable_ept)
- ept_update_paging_mode_cr4(&hw_cr4, vcpu);
+ if (enable_ept) {
+ if (!is_paging(vcpu)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ hw_cr4 |= X86_CR4_PSE;
+ } else if (!(cr4 & X86_CR4_PAE)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ }
+ }
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
@@ -1787,7 +1844,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
- if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
+ if (!is_protmode(vcpu))
return 0;
if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
@@ -2042,7 +2099,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
static bool guest_state_valid(struct kvm_vcpu *vcpu)
{
/* real mode guest state checks */
- if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
+ if (!is_protmode(vcpu)) {
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
@@ -2175,7 +2232,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
if (kvm->arch.apic_access_page)
goto out;
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -2188,7 +2245,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
out:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -2197,7 +2254,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
if (kvm->arch.ept_identity_pagetable)
goto out;
kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
@@ -2212,7 +2269,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
out:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -2384,14 +2441,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
for (i = 0; i < NR_VMX_MSR; ++i) {
u32 index = vmx_msr_index[i];
u32 data_low, data_high;
- u64 data;
int j = vmx->nmsrs;
if (rdmsr_safe(index, &data_low, &data_high) < 0)
continue;
if (wrmsr_safe(index, data_low, data_high) < 0)
continue;
- data = data_low | ((u64)data_high << 32);
vmx->guest_msrs[j].index = i;
vmx->guest_msrs[j].data = 0;
vmx->guest_msrs[j].mask = -1ull;
@@ -2404,7 +2459,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
- vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
+ vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
+ if (enable_ept)
+ vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
rdtscll(tsc_this);
@@ -2429,10 +2487,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 msr;
- int ret;
+ int ret, idx;
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
@@ -2526,7 +2584,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
+ vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
vmx_set_cr4(&vmx->vcpu, 0);
vmx_set_efer(&vmx->vcpu, 0);
vmx_fpu_activate(&vmx->vcpu);
@@ -2540,7 +2598,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->emulation_required = 0;
out:
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
}
@@ -2717,6 +2775,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
kvm_queue_exception(vcpu, vec);
return 1;
case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject the exception
+ * from user space while in guest debugging mode.
+ */
+ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
return 0;
/* fall through */
@@ -2839,6 +2903,13 @@ static int handle_exception(struct kvm_vcpu *vcpu)
kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
/* fall through */
case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject #BP from
+ * user space while in guest debugging mode. Reading it for
+ * #DB as well causes no harm, it is not used in that case.
+ */
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
kvm_run->debug.arch.exception = ex_no;
@@ -2940,11 +3011,10 @@ static int handle_cr(struct kvm_vcpu *vcpu)
};
break;
case 2: /* clts */
- vmx_fpu_deactivate(vcpu);
- vcpu->arch.cr0 &= ~X86_CR0_TS;
- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
- vmx_fpu_activate(vcpu);
+ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
skip_emulated_instruction(vcpu);
+ vmx_fpu_activate(vcpu);
return 1;
case 1: /*mov from cr*/
switch (cr) {
@@ -2962,7 +3032,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
}
break;
case 3: /* lmsw */
- kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ kvm_lmsw(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
@@ -2975,12 +3047,22 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return 0;
}
+static int check_dr_alias(struct kvm_vcpu *vcpu)
+{
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return -1;
+ }
+ return 0;
+}
+
static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
unsigned long val;
int dr, reg;
+ /* Do not handle if the CPL > 0, will trigger GP on re-entry */
if (!kvm_require_cpl(vcpu, 0))
return 1;
dr = vmcs_readl(GUEST_DR7);
@@ -3016,14 +3098,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
case 0 ... 3:
val = vcpu->arch.db[dr];
break;
+ case 4:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
case 6:
val = vcpu->arch.dr6;
break;
- case 7:
+ case 5:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
+ default: /* 7 */
val = vcpu->arch.dr7;
break;
- default:
- val = 0;
}
kvm_register_write(vcpu, reg, val);
} else {
@@ -3034,21 +3122,25 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = val;
break;
- case 4 ... 5:
- if (vcpu->arch.cr4 & X86_CR4_DE)
- kvm_queue_exception(vcpu, UD_VECTOR);
- break;
+ case 4:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
case 6:
if (val & 0xffffffff00000000ULL) {
- kvm_queue_exception(vcpu, GP_VECTOR);
- break;
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
break;
- case 7:
+ case 5:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
+ default: /* 7 */
if (val & 0xffffffff00000000ULL) {
- kvm_queue_exception(vcpu, GP_VECTOR);
- break;
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
@@ -3075,6 +3167,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
u64 data;
if (vmx_get_msr(vcpu, ecx, &data)) {
+ trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -3094,13 +3187,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- trace_kvm_msr_write(ecx, data);
-
if (vmx_set_msr(vcpu, ecx, data) != 0) {
+ trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
}
+ trace_kvm_msr_write(ecx, data);
skip_emulated_instruction(vcpu);
return 1;
}
@@ -3385,7 +3478,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
}
if (err != EMULATE_DONE) {
- kvm_report_emulation_failure(vcpu, "emulation failure");
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -3416,6 +3508,12 @@ static int handle_pause(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -3453,6 +3551,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
};
static const int kvm_vmx_max_exit_handlers =
@@ -3686,9 +3786,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
*/
vmcs_writel(HOST_CR0, read_cr0());
- if (vcpu->arch.switch_db_regs)
- set_debugreg(vcpu->arch.dr6, 6);
-
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
@@ -3789,9 +3886,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
| (1 << VCPU_EXREG_PDPTR));
vcpu->arch.regs_dirty = 0;
- if (vcpu->arch.switch_db_regs)
- get_debugreg(vcpu->arch.dr6, 6);
-
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
@@ -3920,7 +4014,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
* b. VT-d with snooping control feature: snooping control feature of
* VT-d engine can guarantee the cache correctness. Just set it
* to WB to keep consistent with host. So the same as item 3.
- * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
+ * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
* consistent with host MTRR
*/
if (is_mmio)
@@ -3931,37 +4025,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
VMX_EPT_MT_EPTE_SHIFT;
else
ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
- | VMX_EPT_IGMT_BIT;
+ | VMX_EPT_IPAT_BIT;
return ret;
}
+#define _ER(x) { EXIT_REASON_##x, #x }
+
static const struct trace_print_flags vmx_exit_reasons_str[] = {
- { EXIT_REASON_EXCEPTION_NMI, "exception" },
- { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" },
- { EXIT_REASON_TRIPLE_FAULT, "triple_fault" },
- { EXIT_REASON_NMI_WINDOW, "nmi_window" },
- { EXIT_REASON_IO_INSTRUCTION, "io_instruction" },
- { EXIT_REASON_CR_ACCESS, "cr_access" },
- { EXIT_REASON_DR_ACCESS, "dr_access" },
- { EXIT_REASON_CPUID, "cpuid" },
- { EXIT_REASON_MSR_READ, "rdmsr" },
- { EXIT_REASON_MSR_WRITE, "wrmsr" },
- { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" },
- { EXIT_REASON_HLT, "halt" },
- { EXIT_REASON_INVLPG, "invlpg" },
- { EXIT_REASON_VMCALL, "hypercall" },
- { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" },
- { EXIT_REASON_APIC_ACCESS, "apic_access" },
- { EXIT_REASON_WBINVD, "wbinvd" },
- { EXIT_REASON_TASK_SWITCH, "task_switch" },
- { EXIT_REASON_EPT_VIOLATION, "ept_violation" },
+ _ER(EXCEPTION_NMI),
+ _ER(EXTERNAL_INTERRUPT),
+ _ER(TRIPLE_FAULT),
+ _ER(PENDING_INTERRUPT),
+ _ER(NMI_WINDOW),
+ _ER(TASK_SWITCH),
+ _ER(CPUID),
+ _ER(HLT),
+ _ER(INVLPG),
+ _ER(RDPMC),
+ _ER(RDTSC),
+ _ER(VMCALL),
+ _ER(VMCLEAR),
+ _ER(VMLAUNCH),
+ _ER(VMPTRLD),
+ _ER(VMPTRST),
+ _ER(VMREAD),
+ _ER(VMRESUME),
+ _ER(VMWRITE),
+ _ER(VMOFF),
+ _ER(VMON),
+ _ER(CR_ACCESS),
+ _ER(DR_ACCESS),
+ _ER(IO_INSTRUCTION),
+ _ER(MSR_READ),
+ _ER(MSR_WRITE),
+ _ER(MWAIT_INSTRUCTION),
+ _ER(MONITOR_INSTRUCTION),
+ _ER(PAUSE_INSTRUCTION),
+ _ER(MCE_DURING_VMENTRY),
+ _ER(TPR_BELOW_THRESHOLD),
+ _ER(APIC_ACCESS),
+ _ER(EPT_VIOLATION),
+ _ER(EPT_MISCONFIG),
+ _ER(WBINVD),
{ -1, NULL }
};
-static bool vmx_gb_page_enable(void)
+#undef _ER
+
+static int vmx_get_lpage_level(void)
+{
+ if (enable_ept && !cpu_has_vmx_ept_1g_page())
+ return PT_DIRECTORY_LEVEL;
+ else
+ /* For shadow and EPT supported 1GB page */
+ return PT_PDPE_LEVEL;
+}
+
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
+static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
- return false;
+ struct kvm_cpuid_entry2 *best;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exec_control;
+
+ vmx->rdtscp_enabled = false;
+ if (vmx_rdtscp_supported()) {
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ if (exec_control & SECONDARY_EXEC_RDTSCP) {
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
+ vmx->rdtscp_enabled = true;
+ else {
+ exec_control &= ~SECONDARY_EXEC_RDTSCP;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ exec_control);
+ }
+ }
+ }
}
static struct kvm_x86_ops vmx_x86_ops = {
@@ -3990,6 +4135,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_segment = vmx_set_segment,
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
@@ -4002,6 +4148,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+ .fpu_activate = vmx_fpu_activate,
+ .fpu_deactivate = vmx_fpu_deactivate,
.tlb_flush = vmx_flush_tlb,
@@ -4027,7 +4175,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_mt_mask = vmx_get_mt_mask,
.exit_reasons_str = vmx_exit_reasons_str,
- .gb_page_enable = vmx_gb_page_enable,
+ .get_lpage_level = vmx_get_lpage_level,
+
+ .cpuid_update = vmx_cpuid_update,
+
+ .rdtscp_supported = vmx_rdtscp_supported,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1e1bc9d412..e46282a5656 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -38,6 +38,7 @@
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
+#include <linux/srcu.h>
#include <trace/events/kvm.h>
#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
@@ -93,16 +94,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
struct kvm_shared_msrs_global {
int nr;
- struct kvm_shared_msr {
- u32 msr;
- u64 value;
- } msrs[KVM_NR_SHARED_MSRS];
+ u32 msrs[KVM_NR_SHARED_MSRS];
};
struct kvm_shared_msrs {
struct user_return_notifier urn;
bool registered;
- u64 current_value[KVM_NR_SHARED_MSRS];
+ struct kvm_shared_msr_values {
+ u64 host;
+ u64 curr;
+ } values[KVM_NR_SHARED_MSRS];
};
static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
@@ -147,53 +148,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
- struct kvm_shared_msr *global;
struct kvm_shared_msrs *locals
= container_of(urn, struct kvm_shared_msrs, urn);
+ struct kvm_shared_msr_values *values;
for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
- global = &shared_msrs_global.msrs[slot];
- if (global->value != locals->current_value[slot]) {
- wrmsrl(global->msr, global->value);
- locals->current_value[slot] = global->value;
+ values = &locals->values[slot];
+ if (values->host != values->curr) {
+ wrmsrl(shared_msrs_global.msrs[slot], values->host);
+ values->curr = values->host;
}
}
locals->registered = false;
user_return_notifier_unregister(urn);
}
-void kvm_define_shared_msr(unsigned slot, u32 msr)
+static void shared_msr_update(unsigned slot, u32 msr)
{
- int cpu;
+ struct kvm_shared_msrs *smsr;
u64 value;
+ smsr = &__get_cpu_var(shared_msrs);
+ /* only read, and nobody should modify it at this time,
+ * so don't need lock */
+ if (slot >= shared_msrs_global.nr) {
+ printk(KERN_ERR "kvm: invalid MSR slot!");
+ return;
+ }
+ rdmsrl_safe(msr, &value);
+ smsr->values[slot].host = value;
+ smsr->values[slot].curr = value;
+}
+
+void kvm_define_shared_msr(unsigned slot, u32 msr)
+{
if (slot >= shared_msrs_global.nr)
shared_msrs_global.nr = slot + 1;
- shared_msrs_global.msrs[slot].msr = msr;
- rdmsrl_safe(msr, &value);
- shared_msrs_global.msrs[slot].value = value;
- for_each_online_cpu(cpu)
- per_cpu(shared_msrs, cpu).current_value[slot] = value;
+ shared_msrs_global.msrs[slot] = msr;
+ /* we need ensured the shared_msr_global have been updated */
+ smp_wmb();
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
static void kvm_shared_msr_cpu_online(void)
{
unsigned i;
- struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
for (i = 0; i < shared_msrs_global.nr; ++i)
- locals->current_value[i] = shared_msrs_global.msrs[i].value;
+ shared_msr_update(i, shared_msrs_global.msrs[i]);
}
void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
- if (((value ^ smsr->current_value[slot]) & mask) == 0)
+ if (((value ^ smsr->values[slot].curr) & mask) == 0)
return;
- smsr->current_value[slot] = value;
- wrmsrl(shared_msrs_global.msrs[slot].msr, value);
+ smsr->values[slot].curr = value;
+ wrmsrl(shared_msrs_global.msrs[slot], value);
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
@@ -257,12 +269,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
+#define EXCPT_BENIGN 0
+#define EXCPT_CONTRIBUTORY 1
+#define EXCPT_PF 2
+
+static int exception_class(int vector)
+{
+ switch (vector) {
+ case PF_VECTOR:
+ return EXCPT_PF;
+ case DE_VECTOR:
+ case TS_VECTOR:
+ case NP_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ return EXCPT_CONTRIBUTORY;
+ default:
+ break;
+ }
+ return EXCPT_BENIGN;
+}
+
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+ unsigned nr, bool has_error, u32 error_code)
+{
+ u32 prev_nr;
+ int class1, class2;
+
+ if (!vcpu->arch.exception.pending) {
+ queue:
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = has_error;
+ vcpu->arch.exception.nr = nr;
+ vcpu->arch.exception.error_code = error_code;
+ return;
+ }
+
+ /* to check exception */
+ prev_nr = vcpu->arch.exception.nr;
+ if (prev_nr == DF_VECTOR) {
+ /* triple fault -> shutdown */
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return;
+ }
+ class1 = exception_class(prev_nr);
+ class2 = exception_class(nr);
+ if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
+ || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+ /* generate double fault per SDM Table 5-5 */
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = true;
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ } else
+ /* replace previous exception with a new one in a hope
+ that instruction re-execution will regenerate lost
+ exception */
+ goto queue;
+}
+
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- WARN_ON(vcpu->arch.exception.pending);
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.has_error_code = false;
- vcpu->arch.exception.nr = nr;
+ kvm_multiple_exception(vcpu, nr, false, 0);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
@@ -270,25 +338,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
u32 error_code)
{
++vcpu->stat.pf_guest;
-
- if (vcpu->arch.exception.pending) {
- switch(vcpu->arch.exception.nr) {
- case DF_VECTOR:
- /* triple fault -> shutdown */
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
- return;
- case PF_VECTOR:
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
- return;
- default:
- /* replace previous exception with a new one in a hope
- that instruction re-execution will regenerate lost
- exception */
- vcpu->arch.exception.pending = false;
- break;
- }
- }
vcpu->arch.cr2 = addr;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
@@ -301,11 +350,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- WARN_ON(vcpu->arch.exception.pending);
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.has_error_code = true;
- vcpu->arch.exception.nr = nr;
- vcpu->arch.exception.error_code = error_code;
+ kvm_multiple_exception(vcpu, nr, true, error_code);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
@@ -383,12 +428,18 @@ out:
void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- if (cr0 & CR0_RESERVED_BITS) {
+ cr0 |= X86_CR0_ET;
+
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL) {
printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
- cr0, vcpu->arch.cr0);
+ cr0, kvm_read_cr0(vcpu));
kvm_inject_gp(vcpu, 0);
return;
}
+#endif
+
+ cr0 &= ~CR0_RESERVED_BITS;
if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
@@ -405,7 +456,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
- if ((vcpu->arch.shadow_efer & EFER_LME)) {
+ if ((vcpu->arch.efer & EFER_LME)) {
int cs_db, cs_l;
if (!is_pae(vcpu)) {
@@ -443,13 +494,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+ kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long old_cr4 = vcpu->arch.cr4;
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
if (cr4 & CR4_RESERVED_BITS) {
@@ -575,9 +626,11 @@ static inline u32 bit(int bitno)
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 2
+#define KVM_SAVE_MSRS_BEGIN 5
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_APIC_ASSIST_PAGE,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_K6_STAR,
#ifdef CONFIG_X86_64
@@ -602,7 +655,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
}
if (is_paging(vcpu)
- && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
+ && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
kvm_inject_gp(vcpu, 0);
return;
@@ -633,9 +686,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
efer &= ~EFER_LMA;
- efer |= vcpu->arch.shadow_efer & EFER_LMA;
+ efer |= vcpu->arch.efer & EFER_LMA;
- vcpu->arch.shadow_efer = efer;
+ vcpu->arch.efer = efer;
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
@@ -957,6 +1010,100 @@ out:
return r;
}
+static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+{
+ return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
+}
+
+static bool kvm_hv_msr_partition_wide(u32 msr)
+{
+ bool r = false;
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ r = true;
+ break;
+ }
+
+ return r;
+}
+
+static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ kvm->arch.hv_guest_os_id = data;
+ /* setting guest os id to zero disables hypercall page */
+ if (!kvm->arch.hv_guest_os_id)
+ kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+ break;
+ case HV_X64_MSR_HYPERCALL: {
+ u64 gfn;
+ unsigned long addr;
+ u8 instructions[4];
+
+ /* if guest os id is not set hypercall should remain disabled */
+ if (!kvm->arch.hv_guest_os_id)
+ break;
+ if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
+ kvm->arch.hv_hypercall = data;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ kvm_x86_ops->patch_hypercall(vcpu, instructions);
+ ((unsigned char *)instructions)[3] = 0xc3; /* ret */
+ if (copy_to_user((void __user *)addr, instructions, 4))
+ return 1;
+ kvm->arch.hv_hypercall = data;
+ break;
+ }
+ default:
+ pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
+ return 1;
+ }
+ return 0;
+}
+
+static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ switch (msr) {
+ case HV_X64_MSR_APIC_ASSIST_PAGE: {
+ unsigned long addr;
+
+ if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
+ vcpu->arch.hv_vapic = data;
+ break;
+ }
+ addr = gfn_to_hva(vcpu->kvm, data >>
+ HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ if (clear_user((void __user *)addr, PAGE_SIZE))
+ return 1;
+ vcpu->arch.hv_vapic = data;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ default:
+ pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
+ return 1;
+ }
+
+ return 0;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
@@ -1071,6 +1218,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
"0x%x data 0x%llx\n", msr, data);
break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+ mutex_lock(&vcpu->kvm->lock);
+ r = set_msr_hyperv_pw(vcpu, msr, data);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+ } else
+ return set_msr_hyperv(vcpu, msr, data);
+ break;
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
@@ -1170,6 +1327,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
+static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+ struct kvm *kvm = vcpu->kvm;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ data = kvm->arch.hv_guest_os_id;
+ break;
+ case HV_X64_MSR_HYPERCALL:
+ data = kvm->arch.hv_hypercall;
+ break;
+ default:
+ pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX: {
+ int r;
+ struct kvm_vcpu *v;
+ kvm_for_each_vcpu(r, v, vcpu->kvm)
+ if (v == vcpu)
+ data = r;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+ default:
+ pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
@@ -1221,7 +1426,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
data |= (((uint64_t)4ULL) << 40);
break;
case MSR_EFER:
- data = vcpu->arch.shadow_efer;
+ data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
data = vcpu->kvm->arch.wall_clock;
@@ -1236,6 +1441,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
return get_msr_mce(vcpu, msr, pdata);
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+ mutex_lock(&vcpu->kvm->lock);
+ r = get_msr_hyperv_pw(vcpu, msr, pdata);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+ } else
+ return get_msr_hyperv(vcpu, msr, pdata);
+ break;
default:
if (!ignore_msrs) {
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -1261,15 +1476,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
int (*do_msr)(struct kvm_vcpu *vcpu,
unsigned index, u64 *data))
{
- int i;
+ int i, idx;
vcpu_load(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
vcpu_put(vcpu);
@@ -1351,6 +1566,11 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_XEN_HVM:
case KVM_CAP_ADJUST_CLOCK:
case KVM_CAP_VCPU_EVENTS:
+ case KVM_CAP_HYPERV:
+ case KVM_CAP_HYPERV_VAPIC:
+ case KVM_CAP_HYPERV_SPIN:
+ case KVM_CAP_PCI_SEGMENT:
+ case KVM_CAP_X86_ROBUST_SINGLESTEP:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1464,8 +1684,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
+ kvm_x86_ops->vcpu_put(vcpu);
}
static int is_efer_nx(void)
@@ -1530,6 +1750,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
cpuid_fix_nx_cap(vcpu);
r = 0;
kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
out_free:
vfree(cpuid_entries);
@@ -1552,6 +1773,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
goto out;
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
return 0;
out:
@@ -1594,12 +1816,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index, int *nent, int maxnent)
{
unsigned f_nx = is_efer_nx() ? F(NX) : 0;
- unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
#ifdef CONFIG_X86_64
+ unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
+ ? F(GBPAGES) : 0;
unsigned f_lm = F(LM);
#else
+ unsigned f_gbpages = 0;
unsigned f_lm = 0;
#endif
+ unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
/* cpuid 1.edx */
const u32 kvm_supported_word0_x86_features =
@@ -1619,7 +1844,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
F(PAT) | F(PSE36) | 0 /* Reserved */ |
f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
- F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
+ F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
@@ -1866,7 +2091,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
return 0;
if (mce->status & MCI_STATUS_UC) {
if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
- !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+ !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
printk(KERN_DEBUG "kvm: set_mce: "
"injects mce exception while "
"previous one is in progress!\n");
@@ -2160,14 +2385,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
spin_lock(&kvm->mmu_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
spin_unlock(&kvm->mmu_lock);
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return 0;
}
@@ -2176,13 +2401,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
return kvm->arch.n_alloc_mmu_pages;
}
+gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_mem_alias *alias;
+ struct kvm_mem_aliases *aliases;
+
+ aliases = rcu_dereference(kvm->arch.aliases);
+
+ for (i = 0; i < aliases->naliases; ++i) {
+ alias = &aliases->aliases[i];
+ if (alias->flags & KVM_ALIAS_INVALID)
+ continue;
+ if (gfn >= alias->base_gfn
+ && gfn < alias->base_gfn + alias->npages)
+ return alias->target_gfn + gfn - alias->base_gfn;
+ }
+ return gfn;
+}
+
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
{
int i;
struct kvm_mem_alias *alias;
+ struct kvm_mem_aliases *aliases;
- for (i = 0; i < kvm->arch.naliases; ++i) {
- alias = &kvm->arch.aliases[i];
+ aliases = rcu_dereference(kvm->arch.aliases);
+
+ for (i = 0; i < aliases->naliases; ++i) {
+ alias = &aliases->aliases[i];
if (gfn >= alias->base_gfn
&& gfn < alias->base_gfn + alias->npages)
return alias->target_gfn + gfn - alias->base_gfn;
@@ -2200,6 +2447,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
{
int r, n;
struct kvm_mem_alias *p;
+ struct kvm_mem_aliases *aliases, *old_aliases;
r = -EINVAL;
/* General sanity checks */
@@ -2216,26 +2464,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
< alias->target_phys_addr)
goto out;
- down_write(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
+ r = -ENOMEM;
+ aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!aliases)
+ goto out;
+
+ mutex_lock(&kvm->slots_lock);
- p = &kvm->arch.aliases[alias->slot];
+ /* invalidate any gfn reference in case of deletion/shrinking */
+ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
+ aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
+ old_aliases = kvm->arch.aliases;
+ rcu_assign_pointer(kvm->arch.aliases, aliases);
+ synchronize_srcu_expedited(&kvm->srcu);
+ kvm_mmu_zap_all(kvm);
+ kfree(old_aliases);
+
+ r = -ENOMEM;
+ aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!aliases)
+ goto out_unlock;
+
+ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
+
+ p = &aliases->aliases[alias->slot];
p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
p->npages = alias->memory_size >> PAGE_SHIFT;
p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
+ p->flags &= ~(KVM_ALIAS_INVALID);
for (n = KVM_ALIAS_SLOTS; n > 0; --n)
- if (kvm->arch.aliases[n - 1].npages)
+ if (aliases->aliases[n - 1].npages)
break;
- kvm->arch.naliases = n;
+ aliases->naliases = n;
- spin_unlock(&kvm->mmu_lock);
- kvm_mmu_zap_all(kvm);
-
- up_write(&kvm->slots_lock);
-
- return 0;
+ old_aliases = kvm->arch.aliases;
+ rcu_assign_pointer(kvm->arch.aliases, aliases);
+ synchronize_srcu_expedited(&kvm->srcu);
+ kfree(old_aliases);
+ r = 0;
+out_unlock:
+ mutex_unlock(&kvm->slots_lock);
out:
return r;
}
@@ -2273,18 +2543,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic_irqchip(kvm)->lock);
+ raw_spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[0],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ raw_spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic_irqchip(kvm)->lock);
+ raw_spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[1],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ raw_spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_IOAPIC:
r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -2364,29 +2634,62 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log)
{
- int r;
- int n;
+ int r, n, i;
struct kvm_memory_slot *memslot;
- int is_dirty = 0;
+ unsigned long is_dirty = 0;
+ unsigned long *dirty_bitmap = NULL;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
- r = kvm_get_dirty_log(kvm, log, &is_dirty);
- if (r)
+ r = -EINVAL;
+ if (log->slot >= KVM_MEMORY_SLOTS)
+ goto out;
+
+ memslot = &kvm->memslots->memslots[log->slot];
+ r = -ENOENT;
+ if (!memslot->dirty_bitmap)
+ goto out;
+
+ n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+
+ r = -ENOMEM;
+ dirty_bitmap = vmalloc(n);
+ if (!dirty_bitmap)
goto out;
+ memset(dirty_bitmap, 0, n);
+
+ for (i = 0; !is_dirty && i < n/sizeof(long); i++)
+ is_dirty = memslot->dirty_bitmap[i];
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
+ struct kvm_memslots *slots, *old_slots;
+
spin_lock(&kvm->mmu_lock);
kvm_mmu_slot_remove_write_access(kvm, log->slot);
spin_unlock(&kvm->mmu_lock);
- memslot = &kvm->memslots[log->slot];
- n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
- memset(memslot->dirty_bitmap, 0, n);
+
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots)
+ goto out_free;
+
+ memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+ slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
+
+ old_slots = kvm->memslots;
+ rcu_assign_pointer(kvm->memslots, slots);
+ synchronize_srcu_expedited(&kvm->srcu);
+ dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
+ kfree(old_slots);
}
+
r = 0;
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
+ r = -EFAULT;
+out_free:
+ vfree(dirty_bitmap);
out:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -2469,6 +2772,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (vpic) {
r = kvm_ioapic_init(kvm);
if (r) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+ &vpic->dev);
kfree(vpic);
goto create_irqchip_unlock;
}
@@ -2480,10 +2785,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_setup_default_irq_routing(kvm);
if (r) {
mutex_lock(&kvm->irq_lock);
- kfree(kvm->arch.vpic);
- kfree(kvm->arch.vioapic);
- kvm->arch.vpic = NULL;
- kvm->arch.vioapic = NULL;
+ kvm_ioapic_destroy(kvm);
+ kvm_destroy_pic(kvm);
mutex_unlock(&kvm->irq_lock);
}
create_irqchip_unlock:
@@ -2499,7 +2802,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
sizeof(struct kvm_pit_config)))
goto out;
create_pit:
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
@@ -2508,7 +2811,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
break;
case KVM_IRQ_LINE_STATUS:
case KVM_IRQ_LINE: {
@@ -2725,7 +3028,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
!kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
return 0;
- return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
+ return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
}
static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
@@ -2734,17 +3037,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
!kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
return 0;
- return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
+ return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
}
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu)
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_FETCH_MASK;
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_WRITE_MASK;
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+/* uses this to access any guest's mapped memory without checking CPL */
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
+}
+
+static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 access,
+ u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -2767,14 +3097,37 @@ out:
return r;
}
+/* used for instruction fetching */
+static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
+ access | PFERR_FETCH_MASK, error);
+}
+
+static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+ error);
+}
+
+static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 *error)
+{
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+}
+
static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu)
+ struct kvm_vcpu *vcpu, u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -2804,6 +3157,7 @@ static int emulator_read_emulated(unsigned long addr,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ u32 error_code;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -2813,17 +3167,20 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+
+ if (gpa == UNMAPPED_GVA) {
+ kvm_inject_page_fault(vcpu, addr, error_code);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
- if (kvm_read_guest_virt(addr, val, bytes, vcpu)
+ if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
== X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
mmio:
/*
@@ -2862,11 +3219,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ u32 error_code;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
if (gpa == UNMAPPED_GVA) {
- kvm_inject_page_fault(vcpu, addr, 2);
+ kvm_inject_page_fault(vcpu, addr, error_code);
return X86EMUL_PROPAGATE_FAULT;
}
@@ -2930,7 +3288,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
char *kaddr;
u64 val;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
if (gpa == UNMAPPED_GVA ||
(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -2967,35 +3325,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
int emulate_clts(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ kvm_x86_ops->fpu_activate(vcpu);
return X86EMUL_CONTINUE;
}
int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
-
- switch (dr) {
- case 0 ... 3:
- *dest = kvm_x86_ops->get_dr(vcpu, dr);
- return X86EMUL_CONTINUE;
- default:
- pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
- return X86EMUL_UNHANDLEABLE;
- }
+ return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
}
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
- int exception;
- kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
- if (exception) {
- /* FIXME: better handling */
- return X86EMUL_UNHANDLEABLE;
- }
- return X86EMUL_CONTINUE;
+ return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
}
void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3009,7 +3353,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
- kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
+ kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -3017,7 +3361,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
static struct x86_emulate_ops emulate_ops = {
- .read_std = kvm_read_guest_virt,
+ .read_std = kvm_read_guest_virt_system,
+ .fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -3060,8 +3405,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
vcpu->arch.emulate_ctxt.vcpu = vcpu;
vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
vcpu->arch.emulate_ctxt.mode =
+ (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_REAL : cs_l
+ ? X86EMUL_MODE_VM86 : cs_l
? X86EMUL_MODE_PROT64 : cs_db
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
@@ -3153,12 +3499,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
gva_t q = vcpu->arch.pio.guest_gva;
unsigned bytes;
int ret;
+ u32 error_code;
bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
if (vcpu->arch.pio.in)
- ret = kvm_write_guest_virt(q, p, bytes, vcpu);
+ ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
else
- ret = kvm_read_guest_virt(q, p, bytes, vcpu);
+ ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
+
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(vcpu, q, error_code);
+
return ret;
}
@@ -3179,7 +3530,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
if (io->in) {
r = pio_copy_data(vcpu);
if (r)
- return r;
+ goto out;
}
delta = 1;
@@ -3206,7 +3557,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
kvm_register_write(vcpu, VCPU_REGS_RSI, val);
}
}
-
+out:
io->count -= io->cur_count;
io->cur_count = 0;
@@ -3219,11 +3570,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
int r;
if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+ r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
vcpu->arch.pio.size, pd);
else
- r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
+ r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
return r;
}
@@ -3234,7 +3586,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
int i, r = 0;
for (i = 0; i < io->cur_count; i++) {
- if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
+ if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
io->port, io->size, pd)) {
r = -EOPNOTSUPP;
break;
@@ -3248,6 +3600,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
{
unsigned long val;
+ trace_kvm_pio(!in, port, size, 1);
+
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3259,11 +3613,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
vcpu->arch.pio.down = 0;
vcpu->arch.pio.rep = 0;
- trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
- size, 1);
-
- val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- memcpy(vcpu->arch.pio_data, &val, 4);
+ if (!vcpu->arch.pio.in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ memcpy(vcpu->arch.pio_data, &val, 4);
+ }
if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
complete_pio(vcpu);
@@ -3280,6 +3633,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
unsigned now, in_page;
int ret = 0;
+ trace_kvm_pio(!in, port, size, count);
+
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3291,9 +3646,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
vcpu->arch.pio.down = down;
vcpu->arch.pio.rep = rep;
- trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
- size, count);
-
if (!count) {
kvm_x86_ops->skip_emulated_instruction(vcpu);
return 1;
@@ -3325,10 +3677,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
if (!vcpu->arch.pio.in) {
/* string PIO write */
ret = pio_copy_data(vcpu);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- kvm_inject_gp(vcpu, 0);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
return 1;
- }
if (ret == 0 && !pio_string_write(vcpu)) {
complete_pio(vcpu);
if (vcpu->arch.pio.count == 0)
@@ -3487,11 +3837,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
return a0 | ((gpa_t)a1 << 32);
}
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ u64 param, ingpa, outgpa, ret;
+ uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
+ bool fast, longmode;
+ int cs_db, cs_l;
+
+ /*
+ * hypercall generates UD from non zero cpl and real mode
+ * per HYPER-V spec
+ */
+ if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ longmode = is_long_mode(vcpu) && cs_l == 1;
+
+ if (!longmode) {
+ param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
+ ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
+ outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ param = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
+ }
+#endif
+
+ code = param & 0xffff;
+ fast = (param >> 16) & 0x1;
+ rep_cnt = (param >> 32) & 0xfff;
+ rep_idx = (param >> 48) & 0xfff;
+
+ trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+
+ switch (code) {
+ case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+ kvm_vcpu_on_spin(vcpu);
+ break;
+ default:
+ res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+ ret = res | (((u64)rep_done & 0xfff) << 32);
+ if (longmode) {
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+ } else {
+ kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
+ }
+
+ return 1;
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
int r = 1;
+ if (kvm_hv_hypercall_enabled(vcpu->kvm))
+ return kvm_hv_hypercall(vcpu);
+
nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -3534,10 +3949,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
{
char instruction[3];
- int ret = 0;
unsigned long rip = kvm_rip_read(vcpu);
-
/*
* Blow out the MMU to ensure that no other VCPU has an active mapping
* to ensure that the updated hypercall appears atomically across all
@@ -3546,11 +3959,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
kvm_mmu_zap_all(vcpu->kvm);
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- if (emulator_write_emulated(rip, instruction, 3, vcpu)
- != X86EMUL_CONTINUE)
- ret = -EFAULT;
- return ret;
+ return emulator_write_emulated(rip, instruction, 3, vcpu);
}
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -3583,10 +3993,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
{
unsigned long value;
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
switch (cr) {
case 0:
- value = vcpu->arch.cr0;
+ value = kvm_read_cr0(vcpu);
break;
case 2:
value = vcpu->arch.cr2;
@@ -3595,7 +4004,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
value = vcpu->arch.cr3;
break;
case 4:
- value = vcpu->arch.cr4;
+ value = kvm_read_cr4(vcpu);
break;
case 8:
value = kvm_get_cr8(vcpu);
@@ -3613,7 +4022,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
{
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
+ kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
*rflags = kvm_get_rflags(vcpu);
break;
case 2:
@@ -3623,7 +4032,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
kvm_set_cr3(vcpu, val);
break;
case 4:
- kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+ kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
kvm_set_cr8(vcpu, val & 0xfUL);
@@ -3690,6 +4099,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
}
return best;
}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
@@ -3773,14 +4183,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
static void vapic_exit(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ int idx;
if (!apic || !apic->vapic_addr)
return;
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_release_page_dirty(apic->vapic_page);
mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -3876,12 +4287,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 0;
goto out;
}
+ if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+ vcpu->fpu_active = 0;
+ kvm_x86_ops->fpu_deactivate(vcpu);
+ }
}
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
- kvm_load_guest_fpu(vcpu);
+ if (vcpu->fpu_active)
+ kvm_load_guest_fpu(vcpu);
local_irq_disable();
@@ -3909,7 +4325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_lapic_sync_to_vapic(vcpu);
}
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_guest_enter();
@@ -3951,7 +4367,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_enable();
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
/*
* Profile KVM exit RIPs:
@@ -3973,6 +4389,7 @@ out:
static int __vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
+ struct kvm *kvm = vcpu->kvm;
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
pr_debug("vcpu %d received sipi with vector # %x\n",
@@ -3984,7 +4401,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
vapic_enter(vcpu);
r = 1;
@@ -3992,9 +4409,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
r = vcpu_enter_guest(vcpu);
else {
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
{
switch(vcpu->arch.mp_state) {
@@ -4029,13 +4446,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
++vcpu->stat.signal_exits;
}
if (need_resched()) {
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
post_kvm_run_save(vcpu);
vapic_exit(vcpu);
@@ -4074,10 +4491,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->mmio_read_completed = 1;
vcpu->mmio_needed = 0;
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
EMULTYPE_NO_DECODE);
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (r == EMULATE_DO_MMIO) {
/*
* Read-modify-write. Back to userspace.
@@ -4204,13 +4621,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->gdt.limit = dt.limit;
sregs->gdt.base = dt.base;
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- sregs->cr0 = vcpu->arch.cr0;
+ sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
sregs->cr3 = vcpu->arch.cr3;
- sregs->cr4 = vcpu->arch.cr4;
+ sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
- sregs->efer = vcpu->arch.shadow_efer;
+ sregs->efer = vcpu->arch.efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
@@ -4298,14 +4714,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
{
struct descriptor_table dtable;
u16 index = selector >> 3;
+ int ret;
+ u32 err;
+ gva_t addr;
get_segment_descriptor_dtable(vcpu, selector, &dtable);
if (dtable.limit < index * 8 + 7) {
kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
- return 1;
+ return X86EMUL_PROPAGATE_FAULT;
}
- return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
+ addr = dtable.base + index * 8;
+ ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
+ vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(vcpu, addr, err);
+
+ return ret;
}
/* allowed just for 8 bytes segments */
@@ -4319,15 +4744,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
if (dtable.limit < index * 8 + 7)
return 1;
- return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
+ return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
+}
+
+static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
+ struct desc_struct *seg_desc)
+{
+ u32 base_addr = get_desc_base(seg_desc);
+
+ return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
}
-static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
+static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
struct desc_struct *seg_desc)
{
u32 base_addr = get_desc_base(seg_desc);
- return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
+ return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
}
static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -4338,18 +4771,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
return kvm_seg.selector;
}
-static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
- u16 selector,
- struct kvm_segment *kvm_seg)
-{
- struct desc_struct seg_desc;
-
- if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
- return 1;
- seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
- return 0;
-}
-
static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
{
struct kvm_segment segvar = {
@@ -4367,7 +4788,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
.unusable = 0,
};
kvm_x86_ops->set_segment(vcpu, &segvar, seg);
- return 0;
+ return X86EMUL_CONTINUE;
}
static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
@@ -4377,24 +4798,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
(kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
}
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- int type_bits, int seg)
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
{
struct kvm_segment kvm_seg;
+ struct desc_struct seg_desc;
+ u8 dpl, rpl, cpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ int ret;
- if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
+ if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
return kvm_load_realmode_segment(vcpu, selector, seg);
- if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
- return 1;
- kvm_seg.type |= type_bits;
- if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
- seg != VCPU_SREG_LDTR)
- if (!kvm_seg.s)
- kvm_seg.unusable = 1;
+ /* NULL selector is not valid for TR, CS and SS */
+ if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ && null_selector)
+ goto exception;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
+ if (ret)
+ return ret;
+
+ seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
+
+ if (null_selector) { /* for NULL selector skip all following checks */
+ kvm_seg.unusable = 1;
+ goto load;
+ }
+
+ err_code = selector & 0xfffc;
+ err_vec = GP_VECTOR;
+ /* can't load system descriptor into segment selecor */
+ if (seg <= VCPU_SREG_GS && !kvm_seg.s)
+ goto exception;
+
+ if (!kvm_seg.present) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ rpl = selector & 3;
+ dpl = kvm_seg.dpl;
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment or segment
+ * selector's RPL != CPL or segment selector's RPL != CPL
+ */
+ if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(kvm_seg.type & 8))
+ goto exception;
+
+ if (kvm_seg.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (kvm_seg.s || kvm_seg.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((kvm_seg.type & 0xa) == 0x8 ||
+ (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (!kvm_seg.unusable && kvm_seg.s) {
+ /* mark segment as accessed */
+ kvm_seg.type |= 1;
+ seg_desc.type |= 1;
+ save_guest_segment_descriptor(vcpu, selector, &seg_desc);
+ }
+load:
kvm_set_segment(vcpu, &kvm_seg, seg);
- return 0;
+ return X86EMUL_CONTINUE;
+exception:
+ kvm_queue_exception_e(vcpu, err_vec, err_code);
+ return X86EMUL_PROPAGATE_FAULT;
}
static void save_state_to_tss32(struct kvm_vcpu *vcpu,
@@ -4420,6 +4929,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
}
+static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
+{
+ struct kvm_segment kvm_seg;
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ kvm_seg.selector = sel;
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
static int load_state_from_tss32(struct kvm_vcpu *vcpu,
struct tss_segment_32 *tss)
{
@@ -4437,25 +4954,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
- if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+ kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
+ kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
+ if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
+ if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
return 1;
return 0;
}
@@ -4495,19 +5028,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
- if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
return 1;
return 0;
}
@@ -4529,7 +5076,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
sizeof tss_segment_16))
goto out;
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
&tss_segment_16, sizeof tss_segment_16))
goto out;
@@ -4537,7 +5084,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
tss_segment_16.prev_task_link = old_tss_sel;
if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr(vcpu, nseg_desc),
+ get_tss_base_addr_write(vcpu, nseg_desc),
&tss_segment_16.prev_task_link,
sizeof tss_segment_16.prev_task_link))
goto out;
@@ -4568,7 +5115,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
sizeof tss_segment_32))
goto out;
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
&tss_segment_32, sizeof tss_segment_32))
goto out;
@@ -4576,7 +5123,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
tss_segment_32.prev_task_link = old_tss_sel;
if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr(vcpu, nseg_desc),
+ get_tss_base_addr_write(vcpu, nseg_desc),
&tss_segment_32.prev_task_link,
sizeof tss_segment_32.prev_task_link))
goto out;
@@ -4599,7 +5146,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
- old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
+ old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
/* FIXME: Handle errors. Failure to read either TSS or their
* descriptors should generate a pagefault.
@@ -4658,7 +5205,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
&nseg_desc);
}
- kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
tr_seg.type = 11;
kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
@@ -4689,17 +5236,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_set_cr8(vcpu, sregs->cr8);
- mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
+ mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
kvm_x86_ops->set_efer(vcpu, sregs->efer);
kvm_set_apic_base(vcpu, sregs->apic_base);
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-
- mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
+ mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
vcpu->arch.cr0 = sregs->cr0;
- mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
+ mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
load_pdptrs(vcpu, vcpu->arch.cr3);
@@ -4734,7 +5279,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
/* Older userspace won't unhalt the vcpu on reset. */
if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
- !(vcpu->arch.cr0 & X86_CR0_PE))
+ !is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
vcpu_put(vcpu);
@@ -4832,11 +5377,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
{
unsigned long vaddr = tr->linear_address;
gpa_t gpa;
+ int idx;
vcpu_load(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
- up_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
@@ -4917,14 +5463,14 @@ EXPORT_SYMBOL_GPL(fx_init);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
+ if (vcpu->guest_fpu_loaded)
return;
vcpu->guest_fpu_loaded = 1;
kvm_fx_save(&vcpu->arch.host_fx_image);
kvm_fx_restore(&vcpu->arch.guest_fx_image);
+ trace_kvm_fpu(1);
}
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
@@ -4935,8 +5481,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
kvm_fx_save(&vcpu->arch.guest_fx_image);
kvm_fx_restore(&vcpu->arch.host_fx_image);
++vcpu->stat.fpu_reload;
+ set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+ trace_kvm_fpu(0);
}
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
@@ -5088,11 +5635,13 @@ fail:
void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
+ int idx;
+
kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_mmu_destroy(vcpu);
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
}
@@ -5103,6 +5652,12 @@ struct kvm *kvm_arch_create_vm(void)
if (!kvm)
return ERR_PTR(-ENOMEM);
+ kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!kvm->arch.aliases) {
+ kfree(kvm);
+ return ERR_PTR(-ENOMEM);
+ }
+
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -5159,16 +5714,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
+ cleanup_srcu_struct(&kvm->srcu);
+ kfree(kvm->arch.aliases);
kfree(kvm);
}
-int kvm_arch_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
int user_alloc)
{
- int npages = mem->memory_size >> PAGE_SHIFT;
- struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+ int npages = memslot->npages;
/*To keep backward compatibility with older userspace,
*x86 needs to hanlde !user_alloc case.
@@ -5188,26 +5745,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (IS_ERR((void *)userspace_addr))
return PTR_ERR((void *)userspace_addr);
- /* set userspace_addr atomically for kvm_hva_to_rmapp */
- spin_lock(&kvm->mmu_lock);
memslot->userspace_addr = userspace_addr;
- spin_unlock(&kvm->mmu_lock);
- } else {
- if (!old.user_alloc && old.rmap) {
- int ret;
-
- down_write(&current->mm->mmap_sem);
- ret = do_munmap(current->mm, old.userspace_addr,
- old.npages * PAGE_SIZE);
- up_write(&current->mm->mmap_sem);
- if (ret < 0)
- printk(KERN_WARNING
- "kvm_vm_ioctl_set_memory_region: "
- "failed to munmap memory\n");
- }
}
}
+
+ return 0;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+
+ int npages = mem->memory_size >> PAGE_SHIFT;
+
+ if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+ int ret;
+
+ down_write(&current->mm->mmap_sem);
+ ret = do_munmap(current->mm, old.userspace_addr,
+ old.npages * PAGE_SIZE);
+ up_write(&current->mm->mmap_sem);
+ if (ret < 0)
+ printk(KERN_WARNING
+ "kvm_vm_ioctl_set_memory_region: "
+ "failed to munmap memory\n");
+ }
+
spin_lock(&kvm->mmu_lock);
if (!kvm->arch.n_requested_mmu_pages) {
unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
@@ -5216,8 +5782,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
spin_unlock(&kvm->mmu_lock);
-
- return 0;
}
void kvm_arch_flush_shadow(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea585d2..2d101639bd8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,6 +2,7 @@
#define ARCH_X86_KVM_X86_H
#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
@@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
+static inline bool is_protmode(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
+}
+
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return vcpu->arch.efer & EFER_LMA;
+#else
+ return 0;
+#endif
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
+}
+
+static inline int is_pse(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
+}
+
+static inline int is_paging(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
+}
+
#endif
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index f69b7783027..11fc4d5c43e 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -969,7 +969,7 @@ int pohmelfs_setattr_raw(struct inode *inode, struct iattr *attr)
if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
(attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
- err = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ err = dquot_transfer(inode, attr);
if (err)
goto err_out_exit;
}
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 14d94420457..08b2eb15704 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -151,7 +151,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
if (access == V9FS_ACCESS_SINGLE)
return ERR_PTR(-EPERM);
- if (v9fs_extended(v9ses))
+ if (v9fs_proto_dotu(v9ses))
uname = NULL;
else
uname = v9ses->uname;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 7d6c2139891..6c7f6a25111 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -241,7 +241,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
list_add(&v9ses->slist, &v9fs_sessionlist);
spin_unlock(&v9fs_sessionlist_lock);
- v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER;
+ v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
strcpy(v9ses->uname, V9FS_DEFUSER);
strcpy(v9ses->aname, V9FS_DEFANAME);
v9ses->uid = ~0;
@@ -262,13 +262,13 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
goto error;
}
- if (!v9ses->clnt->dotu)
- v9ses->flags &= ~V9FS_EXTENDED;
+ if (!p9_is_proto_dotu(v9ses->clnt))
+ v9ses->flags &= ~V9FS_PROTO_2000U;
v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
/* for legacy mode, fall back to V9FS_ACCESS_ANY */
- if (!v9fs_extended(v9ses) &&
+ if (!v9fs_proto_dotu(v9ses) &&
((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
v9ses->flags &= ~V9FS_ACCESS_MASK;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 019f4ccb70c..79000bf6249 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -23,7 +23,8 @@
/**
* enum p9_session_flags - option flags for each 9P session
- * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions
+ * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
+ * @V9FS_PROTO_2010L: whether or not to use 9P2010.l extensions
* @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
* @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
* @V9FS_ACCESS_ANY: use a single attach for all users
@@ -32,11 +33,12 @@
* Session flags reflect options selected by users at mount time
*/
enum p9_session_flags {
- V9FS_EXTENDED = 0x01,
- V9FS_ACCESS_SINGLE = 0x02,
- V9FS_ACCESS_USER = 0x04,
- V9FS_ACCESS_ANY = 0x06,
- V9FS_ACCESS_MASK = 0x06,
+ V9FS_PROTO_2000U = 0x01,
+ V9FS_PROTO_2010L = 0x02,
+ V9FS_ACCESS_SINGLE = 0x04,
+ V9FS_ACCESS_USER = 0x08,
+ V9FS_ACCESS_ANY = 0x0C,
+ V9FS_ACCESS_MASK = 0x0C,
};
/* possible values of ->cache */
@@ -121,7 +123,12 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
return (inode->i_sb->s_fs_info);
}
-static inline int v9fs_extended(struct v9fs_session_info *v9ses)
+static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
{
- return v9ses->flags & V9FS_EXTENDED;
+ return v9ses->flags & V9FS_PROTO_2000U;
+}
+
+static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
+{
+ return v9ses->flags & V9FS_PROTO_2010L;
}
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 15cce53bf61..6580aa44954 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -135,7 +135,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
while (rdir->head < rdir->tail) {
err = p9stat_read(rdir->buf + rdir->head,
buflen - rdir->head, &st,
- fid->clnt->dotu);
+ fid->clnt->proto_version);
if (err) {
P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 74a0461a9ac..36122683fae 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
v9ses = v9fs_inode2v9ses(inode);
- omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses));
+ omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
fid = file->private_data;
if (!fid) {
fid = v9fs_fid_clone(file->f_path.dentry);
@@ -77,7 +77,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
i_size_write(inode, 0);
inode->i_blocks = 0;
}
- if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
+ if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
generic_file_llseek(file, 0, SEEK_END);
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index a407fa3388c..5fe45d692c9 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -60,7 +60,7 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
res = mode & 0777;
if (S_ISDIR(mode))
res |= P9_DMDIR;
- if (v9fs_extended(v9ses)) {
+ if (v9fs_proto_dotu(v9ses)) {
if (S_ISLNK(mode))
res |= P9_DMSYMLINK;
if (v9ses->nodev == 0) {
@@ -102,21 +102,21 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
if ((mode & P9_DMDIR) == P9_DMDIR)
res |= S_IFDIR;
- else if ((mode & P9_DMSYMLINK) && (v9fs_extended(v9ses)))
+ else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses)))
res |= S_IFLNK;
- else if ((mode & P9_DMSOCKET) && (v9fs_extended(v9ses))
+ else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses))
&& (v9ses->nodev == 0))
res |= S_IFSOCK;
- else if ((mode & P9_DMNAMEDPIPE) && (v9fs_extended(v9ses))
+ else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses))
&& (v9ses->nodev == 0))
res |= S_IFIFO;
- else if ((mode & P9_DMDEVICE) && (v9fs_extended(v9ses))
+ else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
&& (v9ses->nodev == 0))
res |= S_IFBLK;
else
res |= S_IFREG;
- if (v9fs_extended(v9ses)) {
+ if (v9fs_proto_dotu(v9ses)) {
if ((mode & P9_DMSETUID) == P9_DMSETUID)
res |= S_ISUID;
@@ -265,7 +265,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
case S_IFBLK:
case S_IFCHR:
case S_IFSOCK:
- if (!v9fs_extended(v9ses)) {
+ if (!v9fs_proto_dotu(v9ses)) {
P9_DPRINTK(P9_DEBUG_ERROR,
"special files without extended mode\n");
err = -EINVAL;
@@ -278,7 +278,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
inode->i_fop = &v9fs_file_operations;
break;
case S_IFLNK:
- if (!v9fs_extended(v9ses)) {
+ if (!v9fs_proto_dotu(v9ses)) {
P9_DPRINTK(P9_DEBUG_ERROR,
"extended modes used w/o 9P2000.u\n");
err = -EINVAL;
@@ -288,7 +288,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
break;
case S_IFDIR:
inc_nlink(inode);
- if (v9fs_extended(v9ses))
+ if (v9fs_proto_dotu(v9ses))
inode->i_op = &v9fs_dir_inode_operations_ext;
else
inode->i_op = &v9fs_dir_inode_operations;
@@ -575,7 +575,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
flags = O_RDWR;
fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
- v9fs_uflags2omode(flags, v9fs_extended(v9ses)));
+ v9fs_uflags2omode(flags,
+ v9fs_proto_dotu(v9ses)));
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
fid = NULL;
@@ -858,7 +859,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
if (iattr->ia_valid & ATTR_SIZE)
wstat.length = iattr->ia_size;
- if (v9fs_extended(v9ses)) {
+ if (v9fs_proto_dotu(v9ses)) {
if (iattr->ia_valid & ATTR_UID)
wstat.n_uid = iattr->ia_uid;
@@ -886,6 +887,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
struct super_block *sb)
{
char ext[32];
+ char tag_name[14];
+ unsigned int i_nlink;
struct v9fs_session_info *v9ses = sb->s_fs_info;
inode->i_nlink = 1;
@@ -897,11 +900,26 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
inode->i_uid = v9ses->dfltuid;
inode->i_gid = v9ses->dfltgid;
- if (v9fs_extended(v9ses)) {
+ if (v9fs_proto_dotu(v9ses)) {
inode->i_uid = stat->n_uid;
inode->i_gid = stat->n_gid;
}
-
+ if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) {
+ if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) {
+ /*
+ * Hadlink support got added later to
+ * to the .u extension. So there can be
+ * server out there that doesn't support
+ * this even with .u extension. So check
+ * for non NULL stat->extension
+ */
+ strncpy(ext, stat->extension, sizeof(ext));
+ /* HARDLINKCOUNT %u */
+ sscanf(ext, "%13s %u", tag_name, &i_nlink);
+ if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
+ inode->i_nlink = i_nlink;
+ }
+ }
inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
char type = 0;
@@ -976,7 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
if (IS_ERR(fid))
return PTR_ERR(fid);
- if (!v9fs_extended(v9ses))
+ if (!v9fs_proto_dotu(v9ses))
return -EBADF;
st = p9_client_stat(fid);
@@ -1066,7 +1084,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
struct p9_fid *fid;
v9ses = v9fs_inode2v9ses(dir);
- if (!v9fs_extended(v9ses)) {
+ if (!v9fs_proto_dotu(v9ses)) {
P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
return -EPERM;
}
diff --git a/fs/attr.c b/fs/attr.c
index 96d394bdadd..0a6ea54cde7 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -12,7 +12,6 @@
#include <linux/capability.h>
#include <linux/fsnotify.h>
#include <linux/fcntl.h>
-#include <linux/quotaops.h>
#include <linux/security.h>
/* Taken over from the old code... */
@@ -212,14 +211,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
error = inode->i_op->setattr(dentry, attr);
} else {
error = inode_change_ok(inode, attr);
- if (!error) {
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
- error = vfs_dq_transfer(inode, attr) ?
- -EDQUOT : 0;
- if (!error)
- error = inode_setattr(inode, attr);
- }
+ if (!error)
+ error = inode_setattr(inode, attr);
}
if (ia_valid & ATTR_SIZE)
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 7f8d2e5a7ea..1d081f0cfec 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -570,7 +570,7 @@ do_more:
error_return:
brelse(bitmap_bh);
release_blocks(sb, freed);
- vfs_dq_free_block(inode, freed);
+ dquot_free_block(inode, freed);
}
/**
@@ -1236,6 +1236,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
unsigned short windowsz = 0;
unsigned long ngroups;
unsigned long num = *count;
+ int ret;
*errp = -ENOSPC;
sb = inode->i_sb;
@@ -1247,8 +1248,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
/*
* Check quota for allocation of this block.
*/
- if (vfs_dq_alloc_block(inode, num)) {
- *errp = -EDQUOT;
+ ret = dquot_alloc_block(inode, num);
+ if (ret) {
+ *errp = ret;
return 0;
}
@@ -1409,7 +1411,7 @@ allocated:
*errp = 0;
brelse(bitmap_bh);
- vfs_dq_free_block(inode, *count-num);
+ dquot_free_block(inode, *count-num);
*count = num;
return ret_block;
@@ -1420,7 +1422,7 @@ out:
* Undo the block allocation
*/
if (!performed_allocation)
- vfs_dq_free_block(inode, *count);
+ dquot_free_block(inode, *count);
brelse(bitmap_bh);
return 0;
}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 586e3589d4c..5d198d0697f 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@
#include <linux/time.h>
#include <linux/pagemap.h>
+#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
@@ -70,7 +71,7 @@ const struct file_operations ext2_file_operations = {
.compat_ioctl = ext2_compat_ioctl,
#endif
.mmap = generic_file_mmap,
- .open = generic_file_open,
+ .open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
.splice_read = generic_file_splice_read,
@@ -87,7 +88,7 @@ const struct file_operations ext2_xip_file_operations = {
.compat_ioctl = ext2_compat_ioctl,
#endif
.mmap = xip_file_mmap,
- .open = generic_file_open,
+ .open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
};
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 15387c9c17d..ad7d572ee8d 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode)
if (!is_bad_inode(inode)) {
/* Quota is already initialized in iput() */
ext2_xattr_delete_inode(inode);
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
}
es = EXT2_SB(sb)->s_es;
@@ -586,10 +586,10 @@ got:
goto fail_drop;
}
- if (vfs_dq_alloc_inode(inode)) {
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err)
goto fail_drop;
- }
err = ext2_init_acl(inode, dir);
if (err)
@@ -605,10 +605,10 @@ got:
return inode;
fail_free_drop:
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
fail_drop:
- vfs_dq_drop(inode);
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
unlock_new_inode(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 36ae1cac767..fc13cc119aa 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -60,6 +60,8 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
*/
void ext2_delete_inode (struct inode * inode)
{
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -1464,9 +1466,12 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
error = inode_change_ok(inode, iattr);
if (error)
return error;
+
+ if (iattr->ia_valid & ATTR_SIZE)
+ dquot_initialize(inode);
if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
(iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
- error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, iattr);
if (error)
return error;
}
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index dd7175ce560..71efb0e9a3f 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -31,6 +31,7 @@
*/
#include <linux/pagemap.h>
+#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
@@ -99,24 +100,27 @@ struct dentry *ext2_get_parent(struct dentry *child)
*/
static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd)
{
- struct inode * inode = ext2_new_inode (dir, mode);
- int err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- inode->i_op = &ext2_file_inode_operations;
- if (ext2_use_xip(inode->i_sb)) {
- inode->i_mapping->a_ops = &ext2_aops_xip;
- inode->i_fop = &ext2_xip_file_operations;
- } else if (test_opt(inode->i_sb, NOBH)) {
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- inode->i_fop = &ext2_file_operations;
- } else {
- inode->i_mapping->a_ops = &ext2_aops;
- inode->i_fop = &ext2_file_operations;
- }
- mark_inode_dirty(inode);
- err = ext2_add_nondir(dentry, inode);
+ struct inode *inode;
+
+ dquot_initialize(dir);
+
+ inode = ext2_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &ext2_file_inode_operations;
+ if (ext2_use_xip(inode->i_sb)) {
+ inode->i_mapping->a_ops = &ext2_aops_xip;
+ inode->i_fop = &ext2_xip_file_operations;
+ } else if (test_opt(inode->i_sb, NOBH)) {
+ inode->i_mapping->a_ops = &ext2_nobh_aops;
+ inode->i_fop = &ext2_file_operations;
+ } else {
+ inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_fop = &ext2_file_operations;
}
- return err;
+ mark_inode_dirty(inode);
+ return ext2_add_nondir(dentry, inode);
}
static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
@@ -127,6 +131,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
if (!new_valid_dev(rdev))
return -EINVAL;
+ dquot_initialize(dir);
+
inode = ext2_new_inode (dir, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
@@ -151,6 +157,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
if (l > sb->s_blocksize)
goto out;
+ dquot_initialize(dir);
+
inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO);
err = PTR_ERR(inode);
if (IS_ERR(inode))
@@ -194,6 +202,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
if (inode->i_nlink >= EXT2_LINK_MAX)
return -EMLINK;
+ dquot_initialize(dir);
+
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
atomic_inc(&inode->i_count);
@@ -216,6 +226,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
if (dir->i_nlink >= EXT2_LINK_MAX)
goto out;
+ dquot_initialize(dir);
+
inode_inc_link_count(dir);
inode = ext2_new_inode (dir, S_IFDIR | mode);
@@ -262,6 +274,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
struct page * page;
int err = -ENOENT;
+ dquot_initialize(dir);
+
de = ext2_find_entry (dir, &dentry->d_name, &page);
if (!de)
goto out;
@@ -304,6 +318,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
struct ext2_dir_entry_2 * old_de;
int err = -ENOENT;
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
if (!old_de)
goto out;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f9cb54a585c..42e4a303b67 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -194,6 +194,8 @@ static void destroy_inodecache(void)
static void ext2_clear_inode(struct inode *inode)
{
struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
+
+ dquot_drop(inode);
ext2_discard_reservation(inode);
EXT2_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 904f00642f8..e44dc92609b 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -644,8 +644,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
the inode. */
ea_bdebug(new_bh, "reusing block");
- error = -EDQUOT;
- if (vfs_dq_alloc_block(inode, 1)) {
+ error = dquot_alloc_block(inode, 1);
+ if (error) {
unlock_buffer(new_bh);
goto cleanup;
}
@@ -702,7 +702,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
* as if nothing happened and cleanup the unused block */
if (error && error != -ENOSPC) {
if (new_bh && new_bh != old_bh)
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
goto cleanup;
}
} else
@@ -734,7 +734,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
if (ce)
mb_cache_entry_release(ce);
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
mark_buffer_dirty(old_bh);
ea_bdebug(old_bh, "refcount now=%d",
le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -797,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode)
mark_buffer_dirty(bh);
if (IS_SYNC(inode))
sync_dirty_buffer(bh);
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
}
EXT2_I(inode)->i_file_acl = 0;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 27967f92e82..161da2d3f89 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -676,7 +676,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
}
ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
if (dquot_freed_blocks)
- vfs_dq_free_block(inode, dquot_freed_blocks);
+ dquot_free_block(inode, dquot_freed_blocks);
return;
}
@@ -1502,8 +1502,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
/*
* Check quota for allocation of this block.
*/
- if (vfs_dq_alloc_block(inode, num)) {
- *errp = -EDQUOT;
+ err = dquot_alloc_block(inode, num);
+ if (err) {
+ *errp = err;
return 0;
}
@@ -1713,7 +1714,7 @@ allocated:
*errp = 0;
brelse(bitmap_bh);
- vfs_dq_free_block(inode, *count-num);
+ dquot_free_block(inode, *count-num);
*count = num;
return ret_block;
@@ -1728,7 +1729,7 @@ out:
* Undo the block allocation
*/
if (!performed_allocation)
- vfs_dq_free_block(inode, *count);
+ dquot_free_block(inode, *count);
brelse(bitmap_bh);
return 0;
}
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 388bbdfa0b4..f55df0e61cb 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -21,6 +21,7 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
+#include <linux/quotaops.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
#include "xattr.h"
@@ -33,9 +34,9 @@
*/
static int ext3_release_file (struct inode * inode, struct file * filp)
{
- if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
+ if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
filemap_flush(inode->i_mapping);
- EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
+ ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
}
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
@@ -62,7 +63,7 @@ const struct file_operations ext3_file_operations = {
.compat_ioctl = ext3_compat_ioctl,
#endif
.mmap = generic_file_mmap,
- .open = generic_file_open,
+ .open = dquot_file_open,
.release = ext3_release_file,
.fsync = ext3_sync_file,
.splice_read = generic_file_splice_read,
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index b3999128513..ef9008b885b 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
* Note: we must free any quota before locking the superblock,
* as writing the quota to disk may need the lock as well.
*/
- vfs_dq_init(inode);
+ dquot_initialize(inode);
ext3_xattr_delete_inode(handle, inode);
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
is_directory = S_ISDIR(inode->i_mode);
@@ -588,10 +588,10 @@ got:
sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
ret = inode;
- if (vfs_dq_alloc_inode(inode)) {
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err)
goto fail_drop;
- }
err = ext3_init_acl(handle, inode, dir);
if (err)
@@ -619,10 +619,10 @@ really_out:
return ret;
fail_free_drop:
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
fail_drop:
- vfs_dq_drop(inode);
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
unlock_new_inode(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7aca55fcc97..7f920b7263a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -196,6 +196,9 @@ void ext3_delete_inode (struct inode * inode)
{
handle_t *handle;
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -1378,7 +1381,7 @@ static int ext3_journalled_write_end(struct file *file,
*/
if (pos + len > inode->i_size && ext3_can_truncate(inode))
ext3_orphan_add(handle, inode);
- EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
+ ext3_set_inode_state(inode, EXT3_STATE_JDATA);
if (inode->i_size > EXT3_I(inode)->i_disksize) {
EXT3_I(inode)->i_disksize = inode->i_size;
ret2 = ext3_mark_inode_dirty(handle, inode);
@@ -1417,7 +1420,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
- if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
+ if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
/*
* This is a REALLY heavyweight approach, but the use of
* bmap on dirty files is expected to be extremely rare:
@@ -1436,7 +1439,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
* everything they get.
*/
- EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
+ ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
journal = EXT3_JOURNAL(inode);
journal_lock_updates(journal);
err = journal_flush(journal);
@@ -1528,6 +1531,7 @@ static int ext3_ordered_writepage(struct page *page,
int err;
J_ASSERT(PageLocked(page));
+ WARN_ON_ONCE(IS_RDONLY(inode));
/*
* We give up here if we're reentered, because it might be for a
@@ -1600,6 +1604,9 @@ static int ext3_writeback_writepage(struct page *page,
int ret = 0;
int err;
+ J_ASSERT(PageLocked(page));
+ WARN_ON_ONCE(IS_RDONLY(inode));
+
if (ext3_journal_current_handle())
goto out_fail;
@@ -1642,6 +1649,9 @@ static int ext3_journalled_writepage(struct page *page,
int ret = 0;
int err;
+ J_ASSERT(PageLocked(page));
+ WARN_ON_ONCE(IS_RDONLY(inode));
+
if (ext3_journal_current_handle())
goto no_write;
@@ -1670,7 +1680,7 @@ static int ext3_journalled_writepage(struct page *page,
PAGE_CACHE_SIZE, NULL, write_end_fn);
if (ret == 0)
ret = err;
- EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
+ ext3_set_inode_state(inode, EXT3_STATE_JDATA);
unlock_page(page);
} else {
/*
@@ -1785,8 +1795,9 @@ retry:
handle = ext3_journal_start(inode, 2);
if (IS_ERR(handle)) {
/* This is really bad luck. We've written the data
- * but cannot extend i_size. Bail out and pretend
- * the write failed... */
+ * but cannot extend i_size. Truncate allocated blocks
+ * and pretend the write failed... */
+ ext3_truncate(inode);
ret = PTR_ERR(handle);
goto out;
}
@@ -2402,7 +2413,7 @@ void ext3_truncate(struct inode *inode)
goto out_notrans;
if (inode->i_size == 0 && ext3_should_writeback_data(inode))
- ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
+ ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
/*
* We have to lock the EOF page here, because lock_page() nests
@@ -2721,7 +2732,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
{
/* We have all inode data except xattrs in memory here. */
return __ext3_get_inode_loc(inode, iloc,
- !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
+ !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
}
void ext3_set_inode_flags(struct inode *inode)
@@ -2893,7 +2904,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
EXT3_GOOD_OLD_INODE_SIZE +
ei->i_extra_isize;
if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
- ei->i_state |= EXT3_STATE_XATTR;
+ ext3_set_inode_state(inode, EXT3_STATE_XATTR);
}
} else
ei->i_extra_isize = 0;
@@ -2955,7 +2966,7 @@ again:
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
- if (ei->i_state & EXT3_STATE_NEW)
+ if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
ext3_get_inode_flags(ei);
@@ -3052,7 +3063,7 @@ again:
rc = ext3_journal_dirty_metadata(handle, bh);
if (!err)
err = rc;
- ei->i_state &= ~EXT3_STATE_NEW;
+ ext3_clear_inode_state(inode, EXT3_STATE_NEW);
atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
out_brelse:
@@ -3140,6 +3151,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
+ if (ia_valid & ATTR_SIZE)
+ dquot_initialize(inode);
if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
handle_t *handle;
@@ -3152,7 +3165,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
error = PTR_ERR(handle);
goto err_out;
}
- error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, attr);
if (error) {
ext3_journal_stop(handle);
return error;
@@ -3237,7 +3250,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
ret = 2 * (bpp + indirects) + 2;
#ifdef CONFIG_QUOTA
- /* We know that structure was already allocated during vfs_dq_init so
+ /* We know that structure was already allocated during dquot_initialize so
* we will be updating only the data blocks + inodes */
ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
#endif
@@ -3328,7 +3341,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
* i_size has been changed by generic_commit_write() and we thus need
* to include the updated inode in the current transaction.
*
- * Also, vfs_dq_alloc_space() will always dirty the inode when blocks
+ * Also, dquot_alloc_space() will always dirty the inode when blocks
* are allocated to the file.
*
* If the inode is marked synchronous, we don't honour that here - doing
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 7b0e44f7d66..ee184084ca4 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1696,6 +1696,8 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
struct inode * inode;
int err, retries = 0;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1730,6 +1732,8 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1766,6 +1770,8 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
if (dir->i_nlink >= EXT3_LINK_MAX)
return -EMLINK;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -2060,7 +2066,9 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go in
* separate transaction */
- vfs_dq_init(dentry->d_inode);
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
+
handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2119,7 +2127,9 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- vfs_dq_init(dentry->d_inode);
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
+
handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2174,6 +2184,8 @@ static int ext3_symlink (struct inode * dir,
if (l > dir->i_sb->s_blocksize)
return -ENAMETOOLONG;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
@@ -2228,6 +2240,9 @@ static int ext3_link (struct dentry * old_dentry,
if (inode->i_nlink >= EXT3_LINK_MAX)
return -EMLINK;
+
+ dquot_initialize(dir);
+
/*
* Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
* otherwise has the potential to corrupt the orphan inode list.
@@ -2278,12 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
struct ext3_dir_entry_2 * old_de, * new_de;
int retval, flush_file = 0;
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_bh = new_bh = dir_bh = NULL;
/* Initialize quotas before so that eventual writes go
* in separate transaction */
if (new_dentry->d_inode)
- vfs_dq_init(new_dentry->d_inode);
+ dquot_initialize(new_dentry->d_inode);
handle = ext3_journal_start(old_dir, 2 *
EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index afa2b569da1..e844accbf55 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -181,7 +181,7 @@ static void ext3_handle_error(struct super_block *sb)
if (!test_opt (sb, ERRORS_CONT)) {
journal_t *journal = EXT3_SB(sb)->s_journal;
- EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+ set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
if (journal)
journal_abort(journal, -EIO);
}
@@ -296,7 +296,7 @@ void ext3_abort (struct super_block * sb, const char * function,
"error: remounting filesystem read-only");
EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
sb->s_flags |= MS_RDONLY;
- EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+ set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
if (EXT3_SB(sb)->s_journal)
journal_abort(EXT3_SB(sb)->s_journal, -EIO);
}
@@ -528,6 +528,8 @@ static void destroy_inodecache(void)
static void ext3_clear_inode(struct inode *inode)
{
struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
+
+ dquot_drop(inode);
ext3_discard_reservation(inode);
EXT3_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
@@ -562,10 +564,10 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
if (sbi->s_qf_names[GRPQUOTA])
seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
- if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA)
+ if (test_opt(sb, USRQUOTA))
seq_puts(seq, ",usrquota");
- if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)
+ if (test_opt(sb, GRPQUOTA))
seq_puts(seq, ",grpquota");
#endif
}
@@ -656,8 +658,7 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
- seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt &
- EXT3_MOUNT_DATA_FLAGS));
+ seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
if (test_opt(sb, DATA_ERR_ABORT))
seq_puts(seq, ",data_err=abort");
@@ -751,13 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
static const struct dquot_operations ext3_quota_operations = {
- .initialize = dquot_initialize,
- .drop = dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
.write_dquot = ext3_write_dquot,
.acquire_dquot = ext3_acquire_dquot,
.release_dquot = ext3_release_dquot,
@@ -896,6 +890,63 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
return sb_block;
}
+#ifdef CONFIG_QUOTA
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ char *qname;
+
+ if (sb_any_quota_loaded(sb) &&
+ !sbi->s_qf_names[qtype]) {
+ ext3_msg(sb, KERN_ERR,
+ "Cannot change journaled "
+ "quota options when quota turned on");
+ return 0;
+ }
+ qname = match_strdup(args);
+ if (!qname) {
+ ext3_msg(sb, KERN_ERR,
+ "Not enough memory for storing quotafile name");
+ return 0;
+ }
+ if (sbi->s_qf_names[qtype] &&
+ strcmp(sbi->s_qf_names[qtype], qname)) {
+ ext3_msg(sb, KERN_ERR,
+ "%s quota file already specified", QTYPE2NAME(qtype));
+ kfree(qname);
+ return 0;
+ }
+ sbi->s_qf_names[qtype] = qname;
+ if (strchr(sbi->s_qf_names[qtype], '/')) {
+ ext3_msg(sb, KERN_ERR,
+ "quotafile must be on filesystem root");
+ kfree(sbi->s_qf_names[qtype]);
+ sbi->s_qf_names[qtype] = NULL;
+ return 0;
+ }
+ set_opt(sbi->s_mount_opt, QUOTA);
+ return 1;
+}
+
+static int clear_qf_name(struct super_block *sb, int qtype) {
+
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+ if (sb_any_quota_loaded(sb) &&
+ sbi->s_qf_names[qtype]) {
+ ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+ " when quota turned on");
+ return 0;
+ }
+ /*
+ * The space will be released later when all options are confirmed
+ * to be correct
+ */
+ sbi->s_qf_names[qtype] = NULL;
+ return 1;
+}
+#endif
+
static int parse_options (char *options, struct super_block *sb,
unsigned int *inum, unsigned long *journal_devnum,
ext3_fsblk_t *n_blocks_count, int is_remount)
@@ -906,8 +957,7 @@ static int parse_options (char *options, struct super_block *sb,
int data_opt = 0;
int option;
#ifdef CONFIG_QUOTA
- int qtype, qfmt;
- char *qname;
+ int qfmt;
#endif
if (!options)
@@ -1065,20 +1115,19 @@ static int parse_options (char *options, struct super_block *sb,
data_opt = EXT3_MOUNT_WRITEBACK_DATA;
datacheck:
if (is_remount) {
- if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
- == data_opt)
+ if (test_opt(sb, DATA_FLAGS) == data_opt)
break;
ext3_msg(sb, KERN_ERR,
"error: cannot change "
"data mode on remount. The filesystem "
"is mounted in data=%s mode and you "
"try to remount it in data=%s mode.",
- data_mode_string(sbi->s_mount_opt &
- EXT3_MOUNT_DATA_FLAGS),
+ data_mode_string(test_opt(sb,
+ DATA_FLAGS)),
data_mode_string(data_opt));
return 0;
} else {
- sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
+ clear_opt(sbi->s_mount_opt, DATA_FLAGS);
sbi->s_mount_opt |= data_opt;
}
break;
@@ -1090,62 +1139,20 @@ static int parse_options (char *options, struct super_block *sb,
break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
- qtype = USRQUOTA;
- goto set_qf_name;
- case Opt_grpjquota:
- qtype = GRPQUOTA;
-set_qf_name:
- if (sb_any_quota_loaded(sb) &&
- !sbi->s_qf_names[qtype]) {
- ext3_msg(sb, KERN_ERR,
- "error: cannot change journaled "
- "quota options when quota turned on.");
- return 0;
- }
- qname = match_strdup(&args[0]);
- if (!qname) {
- ext3_msg(sb, KERN_ERR,
- "error: not enough memory for "
- "storing quotafile name.");
+ if (!set_qf_name(sb, USRQUOTA, &args[0]))
return 0;
- }
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- ext3_msg(sb, KERN_ERR,
- "error: %s quota file already "
- "specified.", QTYPE2NAME(qtype));
- kfree(qname);
- return 0;
- }
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
- ext3_msg(sb, KERN_ERR,
- "error: quotafile must be on "
- "filesystem root.");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
+ break;
+ case Opt_grpjquota:
+ if (!set_qf_name(sb, GRPQUOTA, &args[0]))
return 0;
- }
- set_opt(sbi->s_mount_opt, QUOTA);
break;
case Opt_offusrjquota:
- qtype = USRQUOTA;
- goto clear_qf_name;
+ if (!clear_qf_name(sb, USRQUOTA))
+ return 0;
+ break;
case Opt_offgrpjquota:
- qtype = GRPQUOTA;
-clear_qf_name:
- if (sb_any_quota_loaded(sb) &&
- sbi->s_qf_names[qtype]) {
- ext3_msg(sb, KERN_ERR, "error: cannot change "
- "journaled quota options when "
- "quota turned on.");
+ if (!clear_qf_name(sb, GRPQUOTA))
return 0;
- }
- /*
- * The space will be released later when all options
- * are confirmed to be correct
- */
- sbi->s_qf_names[qtype] = NULL;
break;
case Opt_jqfmt_vfsold:
qfmt = QFMT_VFS_OLD;
@@ -1244,18 +1251,12 @@ set_qf_format:
}
#ifdef CONFIG_QUOTA
if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
- if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) &&
- sbi->s_qf_names[USRQUOTA])
+ if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sbi->s_mount_opt, USRQUOTA);
-
- if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
- sbi->s_qf_names[GRPQUOTA])
+ if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
clear_opt(sbi->s_mount_opt, GRPQUOTA);
- if ((sbi->s_qf_names[USRQUOTA] &&
- (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
- (sbi->s_qf_names[GRPQUOTA] &&
- (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
+ if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
ext3_msg(sb, KERN_ERR, "error: old and new quota "
"format mixing.");
return 0;
@@ -1478,7 +1479,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
}
list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
- vfs_dq_init(inode);
+ dquot_initialize(inode);
if (inode->i_nlink) {
printk(KERN_DEBUG
"%s: truncating inode %lu to %Ld bytes\n",
@@ -1671,11 +1672,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
set_opt(sbi->s_mount_opt, POSIX_ACL);
#endif
if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
- sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA;
+ set_opt(sbi->s_mount_opt, JOURNAL_DATA);
else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
- sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA;
+ set_opt(sbi->s_mount_opt, ORDERED_DATA);
else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
- sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA;
+ set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -1694,7 +1695,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
(EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2561,11 +2562,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
goto restore_opts;
}
- if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+ if (test_opt(sb, ABORT))
ext3_abort(sb, __func__, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
es = sbi->s_es;
@@ -2573,7 +2574,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
- if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
+ if (test_opt(sb, ABORT)) {
err = -EROFS;
goto restore_opts;
}
@@ -2734,7 +2735,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
* Process 1 Process 2
* ext3_create() quota_sync()
* journal_start() write_dquot()
- * vfs_dq_init() down(dqio_mutex)
+ * dquot_initialize() down(dqio_mutex)
* down(dqio_mutex) journal_start()
*
*/
@@ -2942,9 +2943,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
int err = 0;
int offset = off & (sb->s_blocksize - 1);
- int tocopy;
int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
- size_t towrite = len;
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
@@ -2955,53 +2954,54 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
(unsigned long long)off, (unsigned long long)len);
return -EIO;
}
+
+ /*
+ * Since we account only one data block in transaction credits,
+ * then it is impossible to cross a block boundary.
+ */
+ if (sb->s_blocksize - offset < len) {
+ ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+ " cancelled because not block aligned",
+ (unsigned long long)off, (unsigned long long)len);
+ return -EIO;
+ }
mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
- while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
- bh = ext3_bread(handle, inode, blk, 1, &err);
- if (!bh)
+ bh = ext3_bread(handle, inode, blk, 1, &err);
+ if (!bh)
+ goto out;
+ if (journal_quota) {
+ err = ext3_journal_get_write_access(handle, bh);
+ if (err) {
+ brelse(bh);
goto out;
- if (journal_quota) {
- err = ext3_journal_get_write_access(handle, bh);
- if (err) {
- brelse(bh);
- goto out;
- }
- }
- lock_buffer(bh);
- memcpy(bh->b_data+offset, data, tocopy);
- flush_dcache_page(bh->b_page);
- unlock_buffer(bh);
- if (journal_quota)
- err = ext3_journal_dirty_metadata(handle, bh);
- else {
- /* Always do at least ordered writes for quotas */
- err = ext3_journal_dirty_data(handle, bh);
- mark_buffer_dirty(bh);
}
- brelse(bh);
- if (err)
- goto out;
- offset = 0;
- towrite -= tocopy;
- data += tocopy;
- blk++;
}
+ lock_buffer(bh);
+ memcpy(bh->b_data+offset, data, len);
+ flush_dcache_page(bh->b_page);
+ unlock_buffer(bh);
+ if (journal_quota)
+ err = ext3_journal_dirty_metadata(handle, bh);
+ else {
+ /* Always do at least ordered writes for quotas */
+ err = ext3_journal_dirty_data(handle, bh);
+ mark_buffer_dirty(bh);
+ }
+ brelse(bh);
out:
- if (len == towrite) {
+ if (err) {
mutex_unlock(&inode->i_mutex);
return err;
}
- if (inode->i_size < off+len-towrite) {
- i_size_write(inode, off+len-towrite);
+ if (inode->i_size < off + len) {
+ i_size_write(inode, off + len);
EXT3_I(inode)->i_disksize = inode->i_size;
}
inode->i_version++;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ext3_mark_inode_dirty(handle, inode);
mutex_unlock(&inode->i_mutex);
- return len - towrite;
+ return len;
}
#endif
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 66895ccf76c..534a94c3a93 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -274,7 +274,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
void *end;
int error;
- if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+ if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
return -ENODATA;
error = ext3_get_inode_loc(inode, &iloc);
if (error)
@@ -403,7 +403,7 @@ ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
void *end;
int error;
- if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+ if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
return 0;
error = ext3_get_inode_loc(inode, &iloc);
if (error)
@@ -500,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
error = ext3_journal_dirty_metadata(handle, bh);
if (IS_SYNC(inode))
handle->h_sync = 1;
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
ea_bdebug(bh, "refcount now=%d; releasing",
le32_to_cpu(BHDR(bh)->h_refcount));
if (ce)
@@ -775,8 +775,8 @@ inserted:
else {
/* The old block is released after updating
the inode. */
- error = -EDQUOT;
- if (vfs_dq_alloc_block(inode, 1))
+ error = dquot_alloc_block(inode, 1);
+ if (error)
goto cleanup;
error = ext3_journal_get_write_access(handle,
new_bh);
@@ -850,7 +850,7 @@ cleanup:
return error;
cleanup_dquot:
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
goto cleanup;
bad_block:
@@ -882,7 +882,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
- if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) {
+ if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
error = ext3_xattr_check_names(IFIRST(header), is->s.end);
if (error)
return error;
@@ -914,10 +914,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
header = IHDR(inode, ext3_raw_inode(&is->iloc));
if (!IS_LAST_ENTRY(s->first)) {
header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
- EXT3_I(inode)->i_state |= EXT3_STATE_XATTR;
+ ext3_set_inode_state(inode, EXT3_STATE_XATTR);
} else {
header->h_magic = cpu_to_le32(0);
- EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR;
+ ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
}
return 0;
}
@@ -967,10 +967,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (error)
goto cleanup;
- if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
+ if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
- EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
+ ext3_clear_inode_state(inode, EXT3_STATE_NEW);
}
error = ext3_xattr_ibody_find(inode, &i, &is);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 503a4892740..d0776e410f3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
#include <linux/jbd2.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/quotaops.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -125,7 +126,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
sb->s_dirt = 1;
}
}
- return generic_file_open(inode, filp);
+ return dquot_file_open(inode, filp);
}
const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 004c9da9e5c..361c0b9962a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -214,10 +214,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
* Note: we must free any quota before locking the superblock,
* as writing the quota to disk may need the lock as well.
*/
- vfs_dq_init(inode);
+ dquot_initialize(inode);
ext4_xattr_delete_inode(handle, inode);
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
is_directory = S_ISDIR(inode->i_mode);
@@ -1029,10 +1029,10 @@ got:
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
ret = inode;
- if (vfs_dq_alloc_inode(inode)) {
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err)
goto fail_drop;
- }
err = ext4_init_acl(handle, inode, dir);
if (err)
@@ -1069,10 +1069,10 @@ really_out:
return ret;
fail_free_drop:
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
fail_drop:
- vfs_dq_drop(inode);
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
unlock_new_inode(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f977aade0d1..986120f3006 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -171,6 +171,9 @@ void ext4_delete_inode(struct inode *inode)
handle_t *handle;
int err;
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
@@ -1108,9 +1111,9 @@ void ext4_da_update_reserve_space(struct inode *inode,
/* Update quota subsystem */
if (quota_claim) {
- vfs_dq_claim_block(inode, used);
+ dquot_claim_block(inode, used);
if (mdb_free)
- vfs_dq_release_reservation_block(inode, mdb_free);
+ dquot_release_reservation_block(inode, mdb_free);
} else {
/*
* We did fallocate with an offset that is already delayed
@@ -1121,8 +1124,8 @@ void ext4_da_update_reserve_space(struct inode *inode,
* that
*/
if (allocated_meta_blocks)
- vfs_dq_claim_block(inode, allocated_meta_blocks);
- vfs_dq_release_reservation_block(inode, mdb_free + used);
+ dquot_claim_block(inode, allocated_meta_blocks);
+ dquot_release_reservation_block(inode, mdb_free + used);
}
/*
@@ -1857,6 +1860,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned long md_needed, md_reserved;
+ int ret;
/*
* recalculate the amount of metadata blocks to reserve
@@ -1875,11 +1879,12 @@ repeat:
* later. Real quota accounting is done at pages writeout
* time.
*/
- if (vfs_dq_reserve_block(inode, md_needed + 1))
- return -EDQUOT;
+ ret = dquot_reserve_block(inode, md_needed + 1);
+ if (ret)
+ return ret;
if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
- vfs_dq_release_reservation_block(inode, md_needed + 1);
+ dquot_release_reservation_block(inode, md_needed + 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
@@ -1936,7 +1941,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- vfs_dq_release_reservation_block(inode, to_free);
+ dquot_release_reservation_block(inode, to_free);
}
static void ext4_da_page_release_reservation(struct page *page,
@@ -5418,6 +5423,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
+ if (ia_valid & ATTR_SIZE)
+ dquot_initialize(inode);
if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
handle_t *handle;
@@ -5430,7 +5437,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
error = PTR_ERR(handle);
goto err_out;
}
- error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, attr);
if (error) {
ext4_journal_stop(handle);
return error;
@@ -5816,7 +5823,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* i_size has been changed by generic_commit_write() and we thus need
* to include the updated inode in the current transaction.
*
- * Also, vfs_dq_alloc_block() will always dirty the inode when blocks
+ * Also, dquot_alloc_block() will always dirty the inode when blocks
* are allocated to the file.
*
* If the inode is marked synchronous, we don't honour that here - doing
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index abb11e328b6..506713a2ebd 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4240,7 +4240,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
return 0;
}
reserv_blks = ar->len;
- while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) {
+ while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
ar->len--;
}
@@ -4317,7 +4317,7 @@ out2:
kmem_cache_free(ext4_ac_cachep, ac);
out1:
if (inquota && ar->len < inquota)
- vfs_dq_free_block(ar->inode, inquota - ar->len);
+ dquot_free_block(ar->inode, inquota - ar->len);
out3:
if (!ar->len) {
if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
@@ -4631,7 +4631,7 @@ do_more:
sb->s_dirt = 1;
error_return:
if (freed)
- vfs_dq_free_block(inode, freed);
+ dquot_free_block(inode, freed);
brelse(bitmap_bh);
ext4_std_error(sb, err);
if (ac)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 608d21f873e..0c070fabd10 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1759,6 +1759,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
struct inode *inode;
int err, retries = 0;
+ dquot_initialize(dir);
+
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1793,6 +1795,8 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
+ dquot_initialize(dir);
+
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1830,6 +1834,8 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
if (EXT4_DIR_LINK_MAX(dir))
return -EMLINK;
+ dquot_initialize(dir);
+
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -2137,7 +2143,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go in
* separate transaction */
- vfs_dq_init(dentry->d_inode);
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
+
handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2196,7 +2204,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- vfs_dq_init(dentry->d_inode);
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
+
handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2251,6 +2261,8 @@ static int ext4_symlink(struct inode *dir,
if (l > dir->i_sb->s_blocksize)
return -ENAMETOOLONG;
+ dquot_initialize(dir);
+
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
@@ -2309,6 +2321,8 @@ static int ext4_link(struct dentry *old_dentry,
if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
+ dquot_initialize(dir);
+
/*
* Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
* otherwise has the potential to corrupt the orphan inode list.
@@ -2359,12 +2373,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ext4_dir_entry_2 *old_de, *new_de;
int retval, force_da_alloc = 0;
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_bh = new_bh = dir_bh = NULL;
/* Initialize quotas before so that eventual writes go
* in separate transaction */
if (new_dentry->d_inode)
- vfs_dq_init(new_dentry->d_inode);
+ dquot_initialize(new_dentry->d_inode);
handle = ext4_journal_start(old_dir, 2 *
EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ad1ee5f21ba..2b83b96cb2e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -798,6 +798,7 @@ static void destroy_inodecache(void)
static void ext4_clear_inode(struct inode *inode)
{
+ dquot_drop(inode);
ext4_discard_preallocations(inode);
if (EXT4_JOURNAL(inode))
jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
@@ -1052,19 +1053,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
static const struct dquot_operations ext4_quota_operations = {
- .initialize = dquot_initialize,
- .drop = dquot_drop,
- .alloc_space = dquot_alloc_space,
- .reserve_space = dquot_reserve_space,
- .claim_space = dquot_claim_space,
- .release_rsv = dquot_release_reserved_space,
#ifdef CONFIG_QUOTA
.get_reserved_space = ext4_get_reserved_space,
#endif
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
.write_dquot = ext4_write_dquot,
.acquire_dquot = ext4_acquire_dquot,
.release_dquot = ext4_release_dquot,
@@ -2014,7 +2005,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
}
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
- vfs_dq_init(inode);
+ dquot_initialize(inode);
if (inode->i_nlink) {
ext4_msg(sb, KERN_DEBUG,
"%s: truncating inode %lu to %lld bytes",
@@ -3801,7 +3792,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
* Process 1 Process 2
* ext4_create() quota_sync()
* jbd2_journal_start() write_dquot()
- * vfs_dq_init() down(dqio_mutex)
+ * dquot_initialize() down(dqio_mutex)
* down(dqio_mutex) jbd2_journal_start()
*
*/
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index efc16a4b7ce..b4c5aa8489d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -495,7 +495,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
error = ext4_handle_dirty_metadata(handle, inode, bh);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
ea_bdebug(bh, "refcount now=%d; releasing",
le32_to_cpu(BHDR(bh)->h_refcount));
if (ce)
@@ -787,8 +787,8 @@ inserted:
else {
/* The old block is released after updating
the inode. */
- error = -EDQUOT;
- if (vfs_dq_alloc_block(inode, 1))
+ error = dquot_alloc_block(inode, 1);
+ if (error)
goto cleanup;
error = ext4_journal_get_write_access(handle,
new_bh);
@@ -876,7 +876,7 @@ cleanup:
return error;
cleanup_dquot:
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
goto cleanup;
bad_block:
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e3bf6eab875..6dbcbad6ab1 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1083,7 +1083,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
}
}
-int gfs2_quota_sync(struct super_block *sb, int type)
+int gfs2_quota_sync(struct super_block *sb, int type, int wait)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_quota_data **qda;
@@ -1127,6 +1127,11 @@ int gfs2_quota_sync(struct super_block *sb, int type)
return error;
}
+static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
+{
+ return gfs2_quota_sync(sb, type, 0);
+}
+
int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
{
struct gfs2_quota_data *qd;
@@ -1382,7 +1387,7 @@ int gfs2_quotad(void *data)
&tune->gt_statfs_quantum);
/* Update quota file */
- quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
+ quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
&quotad_timeo, &tune->gt_quota_quantum);
/* Check for & recover partially truncated inodes */
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e271fa07ad0..195f60c8bd1 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,7 +25,7 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
u32 uid, u32 gid);
-extern int gfs2_quota_sync(struct super_block *sb, int type);
+extern int gfs2_quota_sync(struct super_block *sb, int type, int wait);
extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
extern int gfs2_quota_init(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ca87598ead7..50aac606b99 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -764,7 +764,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
int error;
flush_workqueue(gfs2_delete_workqueue);
- gfs2_quota_sync(sdp->sd_vfs, 0);
+ gfs2_quota_sync(sdp->sd_vfs, 0, 1);
gfs2_statfs_sync(sdp->sd_vfs, 0);
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a0db1c94317..b5f1a46133c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -167,7 +167,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
if (simple_strtol(buf, NULL, 0) != 1)
return -EINVAL;
- gfs2_quota_sync(sdp->sd_vfs, 0);
+ gfs2_quota_sync(sdp->sd_vfs, 0, 1);
return len;
}
diff --git a/fs/inode.c b/fs/inode.c
index 03dfeb2e392..407bf392e20 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -8,7 +8,6 @@
#include <linux/mm.h>
#include <linux/dcache.h>
#include <linux/init.h>
-#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/writeback.h>
#include <linux/module.h>
@@ -314,7 +313,6 @@ void clear_inode(struct inode *inode)
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(inode->i_state & I_CLEAR);
inode_sync_wait(inode);
- vfs_dq_drop(inode);
if (inode->i_sb->s_op->clear_inode)
inode->i_sb->s_op->clear_inode(inode);
if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -1211,8 +1209,6 @@ void generic_delete_inode(struct inode *inode)
if (op->delete_inode) {
void (*delete)(struct inode *) = op->delete_inode;
- if (!is_bad_inode(inode))
- vfs_dq_init(inode);
/* Filesystems implementing their own
* s_op->delete_inode are required to call
* truncate_inode_pages and clear_inode()
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 4bd882548c4..2c90e3ef625 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -862,12 +862,12 @@ restart_loop:
/* A buffer which has been freed while still being
* journaled by a previous transaction may end up still
* being dirty here, but we want to avoid writing back
- * that buffer in the future now that the last use has
- * been committed. That's not only a performance gain,
- * it also stops aliasing problems if the buffer is left
- * behind for writeback and gets reallocated for another
+ * that buffer in the future after the "add to orphan"
+ * operation been committed, That's not only a performance
+ * gain, it also stops aliasing problems if the buffer is
+ * left behind for writeback and gets reallocated for another
* use in a different page. */
- if (buffer_freed(bh)) {
+ if (buffer_freed(bh) && !jh->b_next_transaction) {
clear_buffer_freed(bh);
clear_buffer_jbddirty(bh);
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 006f9ad838a..99e9fea1107 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1864,6 +1864,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!jh)
goto zap_buffer_no_jh;
+ /*
+ * We cannot remove the buffer from checkpoint lists until the
+ * transaction adding inode to orphan list (let's call it T)
+ * is committed. Otherwise if the transaction changing the
+ * buffer would be cleaned from the journal before T is
+ * committed, a crash will cause that the correct contents of
+ * the buffer will be lost. On the other hand we have to
+ * clear the buffer dirty bit at latest at the moment when the
+ * transaction marking the buffer as freed in the filesystem
+ * structures is committed because from that moment on the
+ * buffer can be reallocated and used by a different page.
+ * Since the block hasn't been freed yet but the inode has
+ * already been added to orphan list, it is safe for us to add
+ * the buffer to BJ_Forget list of the newest transaction.
+ */
transaction = jh->b_transaction;
if (transaction == NULL) {
/* First case: not on any transaction. If it
@@ -1929,16 +1944,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
goto zap_buffer;
}
/*
- * If it is committing, we simply cannot touch it. We
- * can remove it's next_transaction pointer from the
- * running transaction if that is set, but nothing
- * else. */
+ * The buffer is committing, we simply cannot touch
+ * it. So we just set j_next_transaction to the
+ * running transaction (if there is one) and mark
+ * buffer as freed so that commit code knows it should
+ * clear dirty bits when it is done with the buffer.
+ */
set_buffer_freed(bh);
- if (jh->b_next_transaction) {
- J_ASSERT(jh->b_next_transaction ==
- journal->j_running_transaction);
- jh->b_next_transaction = NULL;
- }
+ if (journal->j_running_transaction && buffer_jbddirty(bh))
+ jh->b_next_transaction = journal->j_running_transaction;
journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
@@ -2120,7 +2134,7 @@ void journal_file_buffer(struct journal_head *jh,
*/
void __journal_refile_buffer(struct journal_head *jh)
{
- int was_dirty;
+ int was_dirty, jlist;
struct buffer_head *bh = jh2bh(jh);
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -2142,8 +2156,13 @@ void __journal_refile_buffer(struct journal_head *jh)
__journal_temp_unlink_buffer(jh);
jh->b_transaction = jh->b_next_transaction;
jh->b_next_transaction = NULL;
- __journal_file_buffer(jh, jh->b_transaction,
- jh->b_modified ? BJ_Metadata : BJ_Reserved);
+ if (buffer_freed(bh))
+ jlist = BJ_Forget;
+ else if (jh->b_modified)
+ jlist = BJ_Metadata;
+ else
+ jlist = BJ_Reserved;
+ __journal_file_buffer(jh, jh->b_transaction, jlist);
J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
if (was_dirty)
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d66477c3430..213169780b6 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -20,7 +20,6 @@
#include <linux/sched.h>
#include <linux/fs.h>
-#include <linux/quotaops.h>
#include <linux/posix_acl_xattr.h>
#include "jfs_incore.h"
#include "jfs_txnmgr.h"
@@ -174,7 +173,7 @@ cleanup:
return rc;
}
-static int jfs_acl_chmod(struct inode *inode)
+int jfs_acl_chmod(struct inode *inode)
{
struct posix_acl *acl, *clone;
int rc;
@@ -205,26 +204,3 @@ static int jfs_acl_chmod(struct inode *inode)
posix_acl_release(clone);
return rc;
}
-
-int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
-{
- struct inode *inode = dentry->d_inode;
- int rc;
-
- rc = inode_change_ok(inode, iattr);
- if (rc)
- return rc;
-
- if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
- (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
- if (vfs_dq_transfer(inode, iattr))
- return -EDQUOT;
- }
-
- rc = inode_setattr(inode, iattr);
-
- if (!rc && (iattr->ia_valid & ATTR_MODE))
- rc = jfs_acl_chmod(inode);
-
- return rc;
-}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 2b70fa78e4a..14ba982b3f2 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -18,6 +18,7 @@
*/
#include <linux/fs.h>
+#include <linux/quotaops.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_dmap.h"
@@ -47,7 +48,7 @@ static int jfs_open(struct inode *inode, struct file *file)
{
int rc;
- if ((rc = generic_file_open(inode, file)))
+ if ((rc = dquot_file_open(inode, file)))
return rc;
/*
@@ -88,14 +89,40 @@ static int jfs_release(struct inode *inode, struct file *file)
return 0;
}
+int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = dentry->d_inode;
+ int rc;
+
+ rc = inode_change_ok(inode, iattr);
+ if (rc)
+ return rc;
+
+ if (iattr->ia_valid & ATTR_SIZE)
+ dquot_initialize(inode);
+ if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
+ (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
+ rc = dquot_transfer(inode, iattr);
+ if (rc)
+ return rc;
+ }
+
+ rc = inode_setattr(inode, iattr);
+
+ if (!rc && (iattr->ia_valid & ATTR_MODE))
+ rc = jfs_acl_chmod(inode);
+
+ return rc;
+}
+
const struct inode_operations jfs_file_inode_operations = {
.truncate = jfs_truncate,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
.listxattr = jfs_listxattr,
.removexattr = jfs_removexattr,
-#ifdef CONFIG_JFS_POSIX_ACL
.setattr = jfs_setattr,
+#ifdef CONFIG_JFS_POSIX_ACL
.check_acl = jfs_check_acl,
#endif
};
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 182b78cc3e6..9dd126276c9 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -149,6 +149,9 @@ void jfs_delete_inode(struct inode *inode)
{
jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
if (!is_bad_inode(inode) &&
(JFS_IP(inode)->fileset == FILESYSTEM_I)) {
truncate_inode_pages(&inode->i_data, 0);
@@ -161,9 +164,9 @@ void jfs_delete_inode(struct inode *inode)
/*
* Free the inode from the quota allocation.
*/
- vfs_dq_init(inode);
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_initialize(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
}
clear_inode(inode);
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index b07bd417ef8..54e07559878 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -22,7 +22,7 @@
int jfs_check_acl(struct inode *, int);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
-int jfs_setattr(struct dentry *, struct iattr *);
+int jfs_acl_chmod(struct inode *inode);
#else
@@ -32,5 +32,10 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode,
return 0;
}
+static inline int jfs_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+
#endif
#endif /* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 925871e9887..0e4623be70c 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -381,10 +381,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
* It's time to move the inline table to an external
* page and begin to build the xtree
*/
- if (vfs_dq_alloc_block(ip, sbi->nbperpage))
+ if (dquot_alloc_block(ip, sbi->nbperpage))
goto clean_up;
if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
- vfs_dq_free_block(ip, sbi->nbperpage);
+ dquot_free_block(ip, sbi->nbperpage);
goto clean_up;
}
@@ -408,7 +408,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
memcpy(&jfs_ip->i_dirtable, temp_table,
sizeof (temp_table));
dbFree(ip, xaddr, sbi->nbperpage);
- vfs_dq_free_block(ip, sbi->nbperpage);
+ dquot_free_block(ip, sbi->nbperpage);
goto clean_up;
}
ip->i_size = PSIZE;
@@ -1027,10 +1027,9 @@ static int dtSplitUp(tid_t tid,
n = xlen;
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, n)) {
- rc = -EDQUOT;
+ rc = dquot_alloc_block(ip, n);
+ if (rc)
goto extendOut;
- }
quota_allocation += n;
if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
@@ -1308,7 +1307,7 @@ static int dtSplitUp(tid_t tid,
/* Rollback quota allocation */
if (rc && quota_allocation)
- vfs_dq_free_block(ip, quota_allocation);
+ dquot_free_block(ip, quota_allocation);
dtSplitUp_Exit:
@@ -1369,9 +1368,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
return -EIO;
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
+ rc = dquot_alloc_block(ip, lengthPXD(pxd));
+ if (rc) {
release_metapage(rmp);
- return -EDQUOT;
+ return rc;
}
jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
@@ -1892,6 +1892,7 @@ static int dtSplitRoot(tid_t tid,
struct dt_lock *dtlck;
struct tlock *tlck;
struct lv *lv;
+ int rc;
/* get split root page */
smp = split->mp;
@@ -1916,9 +1917,10 @@ static int dtSplitRoot(tid_t tid,
rp = rmp->data;
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
+ rc = dquot_alloc_block(ip, lengthPXD(pxd));
+ if (rc) {
release_metapage(rmp);
- return -EDQUOT;
+ return rc;
}
BT_MARK_DIRTY(rmp, ip);
@@ -2287,7 +2289,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
xlen = lengthPXD(&fp->header.self);
/* Free quota allocation. */
- vfs_dq_free_block(ip, xlen);
+ dquot_free_block(ip, xlen);
/* free/invalidate its buffer page */
discard_metapage(fmp);
@@ -2363,7 +2365,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
xlen = lengthPXD(&p->header.self);
/* Free quota allocation */
- vfs_dq_free_block(ip, xlen);
+ dquot_free_block(ip, xlen);
/* free/invalidate its buffer page */
discard_metapage(mp);
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 41d6045dbeb..5d3bbd10f8d 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
}
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, nxlen)) {
+ rc = dquot_alloc_block(ip, nxlen);
+ if (rc) {
dbFree(ip, nxaddr, (s64) nxlen);
mutex_unlock(&JFS_IP(ip)->commit_mutex);
- return -EDQUOT;
+ return rc;
}
/* determine the value of the extent flag */
@@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
*/
if (rc) {
dbFree(ip, nxaddr, nxlen);
- vfs_dq_free_block(ip, nxlen);
+ dquot_free_block(ip, nxlen);
mutex_unlock(&JFS_IP(ip)->commit_mutex);
return (rc);
}
@@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
goto exit;
/* Allocat blocks to quota. */
- if (vfs_dq_alloc_block(ip, nxlen)) {
+ rc = dquot_alloc_block(ip, nxlen);
+ if (rc) {
dbFree(ip, nxaddr, (s64) nxlen);
mutex_unlock(&JFS_IP(ip)->commit_mutex);
- return -EDQUOT;
+ return rc;
}
delta = nxlen - xlen;
@@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
/* extend the extent */
if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
dbFree(ip, xaddr + xlen, delta);
- vfs_dq_free_block(ip, nxlen);
+ dquot_free_block(ip, nxlen);
goto exit;
}
} else {
@@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
*/
if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
dbFree(ip, nxaddr, nxlen);
- vfs_dq_free_block(ip, nxlen);
+ dquot_free_block(ip, nxlen);
goto exit;
}
}
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index dc0e02159ac..829921b6776 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -116,10 +116,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
/*
* Allocate inode to quota.
*/
- if (vfs_dq_alloc_inode(inode)) {
- rc = -EDQUOT;
+ dquot_initialize(inode);
+ rc = dquot_alloc_inode(inode);
+ if (rc)
goto fail_drop;
- }
inode->i_mode = mode;
/* inherit flags from parent */
@@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
return inode;
fail_drop:
- vfs_dq_drop(inode);
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
fail_unlock:
inode->i_nlink = 0;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 15902b03c2a..79e2c79661d 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -40,6 +40,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type);
extern void jfs_set_inode_flags(struct inode *);
extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+extern int jfs_setattr(struct dentry *, struct iattr *);
extern const struct address_space_operations jfs_aops;
extern const struct inode_operations jfs_dir_inode_operations;
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index d654a645864..6c50871e622 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -585,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */
hint = addressXAD(xad) + lengthXAD(xad) - 1;
} else
hint = 0;
- if ((rc = vfs_dq_alloc_block(ip, xlen)))
+ if ((rc = dquot_alloc_block(ip, xlen)))
goto out;
if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
- vfs_dq_free_block(ip, xlen);
+ dquot_free_block(ip, xlen);
goto out;
}
}
@@ -617,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */
/* undo data extent allocation */
if (*xaddrp == 0) {
dbFree(ip, xaddr, (s64) xlen);
- vfs_dq_free_block(ip, xlen);
+ dquot_free_block(ip, xlen);
}
return rc;
}
@@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip,
rbn = addressPXD(pxd);
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
- rc = -EDQUOT;
+ rc = dquot_alloc_block(ip, lengthPXD(pxd));
+ if (rc)
goto clean_up;
- }
quota_allocation += lengthPXD(pxd);
@@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
/* Rollback quota allocation. */
if (quota_allocation)
- vfs_dq_free_block(ip, quota_allocation);
+ dquot_free_block(ip, quota_allocation);
return (rc);
}
@@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid,
struct pxdlist *pxdlist;
struct tlock *tlck;
struct xtlock *xtlck;
+ int rc;
sp = &JFS_IP(ip)->i_xtroot;
@@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid,
return -EIO;
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
+ rc = dquot_alloc_block(ip, lengthPXD(pxd));
+ if (rc) {
release_metapage(rmp);
- return -EDQUOT;
+ return rc;
}
jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
@@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
ip->i_size = newsize;
/* update quota allocation to reflect freed blocks */
- vfs_dq_free_block(ip, nfreed);
+ dquot_free_block(ip, nfreed);
/*
* free tlock of invalidated pages
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index c79a4270f08..4a3e9f39c21 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -85,6 +85,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name);
+ dquot_initialize(dip);
+
/*
* search parent directory for entry/freespace
* (dtSearch() returns parent directory page pinned)
@@ -215,6 +217,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name);
+ dquot_initialize(dip);
+
/* link count overflow on parent directory ? */
if (dip->i_nlink == JFS_LINK_MAX) {
rc = -EMLINK;
@@ -356,7 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
/* Init inode for quota operations. */
- vfs_dq_init(ip);
+ dquot_initialize(dip);
+ dquot_initialize(ip);
/* directory must be empty to be removed */
if (!dtEmpty(ip)) {
@@ -483,7 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
/* Init inode for quota operations. */
- vfs_dq_init(ip);
+ dquot_initialize(dip);
+ dquot_initialize(ip);
if ((rc = get_UCSname(&dname, dentry)))
goto out;
@@ -805,6 +811,8 @@ static int jfs_link(struct dentry *old_dentry,
if (ip->i_nlink == 0)
return -ENOENT;
+ dquot_initialize(dir);
+
tid = txBegin(ip->i_sb, 0);
mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
@@ -896,6 +904,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
+ dquot_initialize(dip);
+
ssize = strlen(name) + 1;
/*
@@ -1087,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
new_dentry->d_name.name);
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_ip = old_dentry->d_inode;
new_ip = new_dentry->d_inode;
@@ -1136,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
} else if (new_ip) {
IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
/* Init inode for quota operations. */
- vfs_dq_init(new_ip);
+ dquot_initialize(new_ip);
}
/*
@@ -1360,6 +1373,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
jfs_info("jfs_mknod: %s", dentry->d_name.name);
+ dquot_initialize(dir);
+
if ((rc = get_UCSname(&dname, dentry)))
goto out;
@@ -1541,8 +1556,8 @@ const struct inode_operations jfs_dir_inode_operations = {
.getxattr = jfs_getxattr,
.listxattr = jfs_listxattr,
.removexattr = jfs_removexattr,
-#ifdef CONFIG_JFS_POSIX_ACL
.setattr = jfs_setattr,
+#ifdef CONFIG_JFS_POSIX_ACL
.check_acl = jfs_check_acl,
#endif
};
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index d929a822a74..266699deb1c 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -131,6 +131,11 @@ static void jfs_destroy_inode(struct inode *inode)
kmem_cache_free(jfs_inode_cachep, ji);
}
+static void jfs_clear_inode(struct inode *inode)
+{
+ dquot_drop(inode);
+}
+
static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
@@ -745,6 +750,7 @@ static const struct super_operations jfs_super_operations = {
.dirty_inode = jfs_dirty_inode,
.write_inode = jfs_write_inode,
.delete_inode = jfs_delete_inode,
+ .clear_inode = jfs_clear_inode,
.put_super = jfs_put_super,
.sync_fs = jfs_sync_fs,
.freeze_fs = jfs_freeze,
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fad364548bc..1f594ab2189 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -260,14 +260,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
/* Allocate new blocks to quota. */
- if (vfs_dq_alloc_block(ip, nblocks)) {
- return -EDQUOT;
- }
+ rc = dquot_alloc_block(ip, nblocks);
+ if (rc)
+ return rc;
rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
if (rc) {
/*Rollback quota allocation. */
- vfs_dq_free_block(ip, nblocks);
+ dquot_free_block(ip, nblocks);
return rc;
}
@@ -332,7 +332,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
failed:
/* Rollback quota allocation. */
- vfs_dq_free_block(ip, nblocks);
+ dquot_free_block(ip, nblocks);
dbFree(ip, blkno, nblocks);
return rc;
@@ -538,7 +538,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
if (blocks_needed > current_blocks) {
/* Allocate new blocks to quota. */
- if (vfs_dq_alloc_block(inode, blocks_needed))
+ rc = dquot_alloc_block(inode, blocks_needed);
+ if (rc)
return -EDQUOT;
quota_allocation = blocks_needed;
@@ -602,7 +603,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
clean_up:
/* Rollback quota allocation */
if (quota_allocation)
- vfs_dq_free_block(inode, quota_allocation);
+ dquot_free_block(inode, quota_allocation);
return (rc);
}
@@ -677,7 +678,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
/* If old blocks exist, they must be removed from quota allocation. */
if (old_blocks)
- vfs_dq_free_block(inode, old_blocks);
+ dquot_free_block(inode, old_blocks);
inode->i_ctime = CURRENT_TIME;
diff --git a/fs/namei.c b/fs/namei.c
index 9a6456099f1..3d9d2f965f8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -19,7 +19,6 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
-#include <linux/quotaops.h>
#include <linux/pagemap.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
@@ -1416,7 +1415,6 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
- vfs_dq_init(dir);
error = dir->i_op->create(dir, dentry, mode, nd);
if (!error)
fsnotify_create(dir, dentry);
@@ -1586,9 +1584,6 @@ static struct file *finish_open(struct nameidata *nd,
}
}
if (!IS_ERR(filp)) {
- if (acc_mode & MAY_WRITE)
- vfs_dq_init(nd->path.dentry->d_inode);
-
if (will_truncate) {
error = handle_truncate(&nd->path);
if (error) {
@@ -1986,7 +1981,6 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
if (error)
return error;
- vfs_dq_init(dir);
error = dir->i_op->mknod(dir, dentry, mode, dev);
if (!error)
fsnotify_create(dir, dentry);
@@ -2085,7 +2079,6 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
if (error)
return error;
- vfs_dq_init(dir);
error = dir->i_op->mkdir(dir, dentry, mode);
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -2171,8 +2164,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
if (!dir->i_op->rmdir)
return -EPERM;
- vfs_dq_init(dir);
-
mutex_lock(&dentry->d_inode->i_mutex);
dentry_unhash(dentry);
if (d_mountpoint(dentry))
@@ -2258,8 +2249,6 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
if (!dir->i_op->unlink)
return -EPERM;
- vfs_dq_init(dir);
-
mutex_lock(&dentry->d_inode->i_mutex);
if (d_mountpoint(dentry))
error = -EBUSY;
@@ -2372,7 +2361,6 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
if (error)
return error;
- vfs_dq_init(dir);
error = dir->i_op->symlink(dir, dentry, oldname);
if (!error)
fsnotify_create(dir, dentry);
@@ -2456,7 +2444,6 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
return error;
mutex_lock(&inode->i_mutex);
- vfs_dq_init(dir);
error = dir->i_op->link(old_dentry, dir, new_dentry);
mutex_unlock(&inode->i_mutex);
if (!error)
@@ -2657,9 +2644,6 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!old_dir->i_op->rename)
return -EPERM;
- vfs_dq_init(old_dir);
- vfs_dq_init(new_dir);
-
old_name = fsnotify_oldname_init(old_dentry->d_name.name);
if (is_dir)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 15dc2deaac5..8eca17df4f6 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -20,7 +20,6 @@
#include <linux/fcntl.h>
#include <linux/namei.h>
#include <linux/delay.h>
-#include <linux/quotaops.h>
#include <linux/fsnotify.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
@@ -377,7 +376,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
put_write_access(inode);
goto out_nfserr;
}
- vfs_dq_init(inode);
}
/* sanitize the mode change */
@@ -745,8 +743,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
flags = O_RDWR|O_LARGEFILE;
else
flags = O_WRONLY|O_LARGEFILE;
-
- vfs_dq_init(inode);
}
*filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
flags, current_cred());
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 2bbe1ecc08c..9f8bd913c51 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5713,7 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
goto out;
}
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(inode->i_sb, len));
ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
@@ -6936,7 +6936,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
goto bail;
}
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
spin_lock(&OCFS2_I(inode)->ip_lock);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
@@ -7301,11 +7301,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
unsigned int page_end;
u64 phys;
- if (vfs_dq_alloc_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, 1))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
@@ -7381,7 +7380,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 4c2a6d282c4..21441ddb550 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1764,10 +1764,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
wc->w_handle = handle;
- if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
- ret = -EDQUOT;
- goto out_commit;
+ if (clusters_to_alloc) {
+ ret = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
+ if (ret)
+ goto out_commit;
}
/*
* We don't want this to fail in ocfs2_write_end(), so do it
@@ -1810,7 +1811,7 @@ success:
return 0;
out_quota:
if (clusters_to_alloc)
- vfs_dq_free_space(inode,
+ dquot_free_space(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
out_commit:
ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 765d66c7098..efd77d071c8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
goto out;
}
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(osb->sb,
- alloc + dx_alloc))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(dir, bytes_allocated);
+ dquot_free_space_nodirty(dir, bytes_allocated);
ocfs2_commit_trans(osb, handle);
@@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
if (extend) {
u32 offset = OCFS2_I(dir)->ip_clusters;
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(sb, 1))) {
- status = -EDQUOT;
+ status = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(sb, 1));
+ if (status)
goto bail;
- }
did_quota = 1;
status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
@@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
status = 0;
bail:
if (did_quota && status < 0)
- vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
+ dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
mlog_exit(status);
return status;
}
@@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
goto out;
}
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(dir->i_sb, 1));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
@@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(dir,
+ dquot_free_space_nodirty(dir,
ocfs2_clusters_to_bytes(dir->i_sb, 1));
ocfs2_commit_trans(osb, handle);
@@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
goto out;
}
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(osb->sb, 1))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
/*
@@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(dir,
+ dquot_free_space_nodirty(dir,
ocfs2_clusters_to_bytes(dir->i_sb, 1));
ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5b52547d629..17947dc8341 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
+ if (file->f_mode & FMODE_WRITE)
+ dquot_initialize(inode);
+
spin_lock(&oi->ip_lock);
/* Check that the inode hasn't been wiped from disk by another
@@ -629,11 +632,10 @@ restart_all:
}
restarted_transaction:
- if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
- clusters_to_add))) {
- status = -EDQUOT;
+ status = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+ if (status)
goto leave;
- }
did_quota = 1;
/* reserve a write to the file entry early on - that we if we
@@ -674,7 +676,7 @@ restarted_transaction:
clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
spin_unlock(&OCFS2_I(inode)->ip_lock);
/* Release unused quota reservation */
- vfs_dq_free_space(inode,
+ dquot_free_space(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
did_quota = 0;
@@ -710,7 +712,7 @@ restarted_transaction:
leave:
if (status < 0 && did_quota)
- vfs_dq_free_space(inode,
+ dquot_free_space(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
if (handle) {
ocfs2_commit_trans(osb, handle);
@@ -978,6 +980,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
if (size_change) {
+ dquot_initialize(inode);
+
status = ocfs2_rw_lock(inode, 1);
if (status < 0) {
mlog_errno(status);
@@ -1020,7 +1024,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
/*
* Gather pointers to quota structures so that allocation /
* freeing of quota structures happens here and not inside
- * vfs_dq_transfer() where we have problems with lock ordering
+ * dquot_transfer() where we have problems with lock ordering
*/
if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1053,7 +1057,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
mlog_errno(status);
goto bail_unlock;
}
- status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ status = dquot_transfer(inode, attr);
if (status < 0)
goto bail_commit;
} else {
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 88459bdd1ff..278a223aae1 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -665,7 +665,7 @@ static int ocfs2_remove_inode(struct inode *inode,
}
ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
status = ocfs2_free_dinode(handle, inode_alloc_inode,
inode_alloc_bh, di);
@@ -971,6 +971,8 @@ void ocfs2_delete_inode(struct inode *inode)
goto bail;
}
+ dquot_initialize(inode);
+
if (!ocfs2_inode_is_valid_to_delete(inode)) {
/* It's probably not necessary to truncate_inode_pages
* here but we do it for safety anyway (it will most
@@ -1087,6 +1089,8 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
+ dquot_drop(inode);
+
/* To preven remote deletes we hold open lock before, now it
* is time to unlock PR and EX open locks. */
ocfs2_open_unlock(inode);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 50fb26a6a5f..d9cd4e373a5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
} else
inode->i_gid = current_fsgid();
inode->i_mode = mode;
- vfs_dq_init(inode);
+ dquot_initialize(inode);
return inode;
}
@@ -244,6 +244,8 @@ static int ocfs2_mknod(struct inode *dir,
(unsigned long)dev, dentry->d_name.len,
dentry->d_name.name);
+ dquot_initialize(dir);
+
/* get our super block */
osb = OCFS2_SB(dir->i_sb);
@@ -348,13 +350,9 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- /* We don't use standard VFS wrapper because we don't want vfs_dq_init
- * to be called. */
- if (sb_any_quota_active(osb->sb) &&
- osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
- status = -EDQUOT;
+ status = dquot_alloc_inode(inode);
+ if (status)
goto leave;
- }
did_quota_inode = 1;
mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
@@ -431,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir,
status = 0;
leave:
if (status < 0 && did_quota_inode)
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
@@ -636,6 +634,8 @@ static int ocfs2_link(struct dentry *old_dentry,
if (S_ISDIR(inode->i_mode))
return -EPERM;
+ dquot_initialize(dir);
+
err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
if (err < 0) {
if (err != -ENOENT)
@@ -791,6 +791,8 @@ static int ocfs2_unlink(struct inode *dir,
mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
dentry->d_name.len, dentry->d_name.name);
+ dquot_initialize(dir);
+
BUG_ON(dentry->d_parent->d_inode != dir);
mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1051,6 +1053,9 @@ static int ocfs2_rename(struct inode *old_dir,
old_dentry->d_name.len, old_dentry->d_name.name,
new_dentry->d_name.len, new_dentry->d_name.name);
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
osb = OCFS2_SB(old_dir->i_sb);
if (new_inode) {
@@ -1599,6 +1604,8 @@ static int ocfs2_symlink(struct inode *dir,
mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
dentry, symname, dentry->d_name.len, dentry->d_name.name);
+ dquot_initialize(dir);
+
sb = dir->i_sb;
osb = OCFS2_SB(sb);
@@ -1688,13 +1695,9 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
- /* We don't use standard VFS wrapper because we don't want vfs_dq_init
- * to be called. */
- if (sb_any_quota_active(osb->sb) &&
- osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
- status = -EDQUOT;
+ status = dquot_alloc_inode(inode);
+ if (status)
goto bail;
- }
did_quota_inode = 1;
mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
@@ -1716,11 +1719,10 @@ static int ocfs2_symlink(struct inode *dir,
u32 offset = 0;
inode->i_op = &ocfs2_symlink_inode_operations;
- if (vfs_dq_alloc_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, 1))) {
- status = -EDQUOT;
+ status = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (status)
goto bail;
- }
did_quota = 1;
status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
new_fe_bh,
@@ -1788,10 +1790,10 @@ static int ocfs2_symlink(struct inode *dir,
d_instantiate(dentry, inode);
bail:
if (status < 0 && did_quota)
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
@@ -2099,13 +2101,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
goto leave;
}
- /* We don't use standard VFS wrapper because we don't want vfs_dq_init
- * to be called. */
- if (sb_any_quota_active(osb->sb) &&
- osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
- status = -EDQUOT;
+ status = dquot_alloc_inode(inode);
+ if (status)
goto leave;
- }
did_quota_inode = 1;
inode->i_nlink = 0;
@@ -2140,7 +2138,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
insert_inode_hash(inode);
leave:
if (status < 0 && did_quota_inode)
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b437dc0c4ca..355f41d1d52 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -851,13 +851,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
}
const struct dquot_operations ocfs2_quota_operations = {
- .initialize = dquot_initialize,
- .drop = dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
.write_dquot = ocfs2_write_dquot,
.acquire_dquot = ocfs2_acquire_dquot,
.release_dquot = ocfs2_release_dquot,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index fb6aa7acf54..9e96921dffd 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4390,7 +4390,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
}
mutex_lock(&inode->i_mutex);
- vfs_dq_init(dir);
+ dquot_initialize(dir);
error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
mutex_unlock(&inode->i_mutex);
if (!error)
diff --git a/fs/open.c b/fs/open.c
index e0b2d88b038..e17f54454b5 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -8,7 +8,6 @@
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/fdtable.h>
-#include <linux/quotaops.h>
#include <linux/fsnotify.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -278,10 +277,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
error = locks_verify_truncate(inode, NULL, length);
if (!error)
error = security_path_truncate(&path, length, 0);
- if (!error) {
- vfs_dq_init(inode);
+ if (!error)
error = do_truncate(path.dentry, length, 0, NULL);
- }
put_write_and_out:
put_write_access(inode);
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index efc02ebb8c7..dad7fb247dd 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -59,3 +59,8 @@ config QUOTACTL
bool
depends on XFS_QUOTA || QUOTA
default y
+
+config QUOTACTL_COMPAT
+ bool
+ depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT
+ default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 68d4f6dc057..5f9e9e276af 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o
obj-$(CONFIG_QFMT_V2) += quota_v2.o
obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
obj-$(CONFIG_QUOTACTL) += quota.o
+obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o
+obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
new file mode 100644
index 00000000000..fb1892fe3e5
--- /dev/null
+++ b/fs/quota/compat.c
@@ -0,0 +1,118 @@
+
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/quotaops.h>
+
+/*
+ * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
+ * and is necessary due to alignment problems.
+ */
+struct compat_if_dqblk {
+ compat_u64 dqb_bhardlimit;
+ compat_u64 dqb_bsoftlimit;
+ compat_u64 dqb_curspace;
+ compat_u64 dqb_ihardlimit;
+ compat_u64 dqb_isoftlimit;
+ compat_u64 dqb_curinodes;
+ compat_u64 dqb_btime;
+ compat_u64 dqb_itime;
+ compat_uint_t dqb_valid;
+};
+
+/* XFS structures */
+struct compat_fs_qfilestat {
+ compat_u64 dqb_bhardlimit;
+ compat_u64 qfs_nblks;
+ compat_uint_t qfs_nextents;
+};
+
+struct compat_fs_quota_stat {
+ __s8 qs_version;
+ __u16 qs_flags;
+ __s8 qs_pad;
+ struct compat_fs_qfilestat qs_uquota;
+ struct compat_fs_qfilestat qs_gquota;
+ compat_uint_t qs_incoredqs;
+ compat_int_t qs_btimelimit;
+ compat_int_t qs_itimelimit;
+ compat_int_t qs_rtbtimelimit;
+ __u16 qs_bwarnlimit;
+ __u16 qs_iwarnlimit;
+};
+
+asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
+ qid_t id, void __user *addr)
+{
+ unsigned int cmds;
+ struct if_dqblk __user *dqblk;
+ struct compat_if_dqblk __user *compat_dqblk;
+ struct fs_quota_stat __user *fsqstat;
+ struct compat_fs_quota_stat __user *compat_fsqstat;
+ compat_uint_t data;
+ u16 xdata;
+ long ret;
+
+ cmds = cmd >> SUBCMDSHIFT;
+
+ switch (cmds) {
+ case Q_GETQUOTA:
+ dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
+ compat_dqblk = addr;
+ ret = sys_quotactl(cmd, special, id, dqblk);
+ if (ret)
+ break;
+ if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
+ get_user(data, &dqblk->dqb_valid) ||
+ put_user(data, &compat_dqblk->dqb_valid))
+ ret = -EFAULT;
+ break;
+ case Q_SETQUOTA:
+ dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
+ compat_dqblk = addr;
+ ret = -EFAULT;
+ if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
+ get_user(data, &compat_dqblk->dqb_valid) ||
+ put_user(data, &dqblk->dqb_valid))
+ break;
+ ret = sys_quotactl(cmd, special, id, dqblk);
+ break;
+ case Q_XGETQSTAT:
+ fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
+ compat_fsqstat = addr;
+ ret = sys_quotactl(cmd, special, id, fsqstat);
+ if (ret)
+ break;
+ ret = -EFAULT;
+ /* Copying qs_version, qs_flags, qs_pad */
+ if (copy_in_user(compat_fsqstat, fsqstat,
+ offsetof(struct compat_fs_quota_stat, qs_uquota)))
+ break;
+ /* Copying qs_uquota */
+ if (copy_in_user(&compat_fsqstat->qs_uquota,
+ &fsqstat->qs_uquota,
+ sizeof(compat_fsqstat->qs_uquota)) ||
+ get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
+ put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
+ break;
+ /* Copying qs_gquota */
+ if (copy_in_user(&compat_fsqstat->qs_gquota,
+ &fsqstat->qs_gquota,
+ sizeof(compat_fsqstat->qs_gquota)) ||
+ get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
+ put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
+ break;
+ /* Copying the rest */
+ if (copy_in_user(&compat_fsqstat->qs_incoredqs,
+ &fsqstat->qs_incoredqs,
+ sizeof(struct compat_fs_quota_stat) -
+ offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
+ get_user(xdata, &fsqstat->qs_iwarnlimit) ||
+ put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
+ break;
+ ret = 0;
+ break;
+ default:
+ ret = sys_quotactl(cmd, special, id, addr);
+ }
+ return ret;
+}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3fc62b097be..e0b870f4749 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -100,9 +100,13 @@
*
* Any operation working on dquots via inode pointers must hold dqptr_sem. If
* operation is just reading pointers from inode (or not using them at all) the
- * read lock is enough. If pointers are altered function must hold write lock
- * (these locking rules also apply for S_NOQUOTA flag in the inode - note that
- * for altering the flag i_mutex is also needed).
+ * read lock is enough. If pointers are altered function must hold write lock.
+ * Special care needs to be taken about S_NOQUOTA inode flag (marking that
+ * inode is a quota file). Functions adding pointers from inode to dquots have
+ * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they
+ * have to do all pointer modifications before dropping dqptr_sem. This makes
+ * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
+ * then drops all pointers to dquots from an inode.
*
* Each dquot has its dq_lock mutex. Locked dquots might not be referenced
* from inodes (dquot_alloc_space() and such don't check the dq_lock).
@@ -225,6 +229,9 @@ static struct hlist_head *dquot_hash;
struct dqstats dqstats;
EXPORT_SYMBOL(dqstats);
+static qsize_t inode_get_rsv_space(struct inode *inode);
+static void __dquot_initialize(struct inode *inode, int type);
+
static inline unsigned int
hashfn(const struct super_block *sb, unsigned int id, int type)
{
@@ -564,7 +571,7 @@ out:
}
EXPORT_SYMBOL(dquot_scan_active);
-int vfs_quota_sync(struct super_block *sb, int type)
+int vfs_quota_sync(struct super_block *sb, int type, int wait)
{
struct list_head *dirty;
struct dquot *dquot;
@@ -609,6 +616,33 @@ int vfs_quota_sync(struct super_block *sb, int type)
spin_unlock(&dq_list_lock);
mutex_unlock(&dqopt->dqonoff_mutex);
+ if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
+ return 0;
+
+ /* This is not very clever (and fast) but currently I don't know about
+ * any other simple way of getting quota data to disk and we must get
+ * them there for userspace to be visible... */
+ if (sb->s_op->sync_fs)
+ sb->s_op->sync_fs(sb, 1);
+ sync_blockdev(sb->s_bdev);
+
+ /*
+ * Now when everything is written we can discard the pagecache so
+ * that userspace sees the changes.
+ */
+ mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (type != -1 && cnt != type)
+ continue;
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
+ mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
+ I_MUTEX_QUOTA);
+ truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
+ mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
+ }
+ mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+
return 0;
}
EXPORT_SYMBOL(vfs_quota_sync);
@@ -840,11 +874,14 @@ static int dqinit_needed(struct inode *inode, int type)
static void add_dquot_ref(struct super_block *sb, int type)
{
struct inode *inode, *old_inode = NULL;
+ int reserved = 0;
spin_lock(&inode_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
continue;
+ if (unlikely(inode_get_rsv_space(inode) > 0))
+ reserved = 1;
if (!atomic_read(&inode->i_writecount))
continue;
if (!dqinit_needed(inode, type))
@@ -854,7 +891,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
spin_unlock(&inode_lock);
iput(old_inode);
- sb->dq_op->initialize(inode, type);
+ __dquot_initialize(inode, type);
/* We hold a reference to 'inode' so it couldn't have been
* removed from s_inodes list while we dropped the inode_lock.
* We cannot iput the inode now as we can be holding the last
@@ -865,6 +902,12 @@ static void add_dquot_ref(struct super_block *sb, int type)
}
spin_unlock(&inode_lock);
iput(old_inode);
+
+ if (reserved) {
+ printk(KERN_WARNING "VFS (%s): Writes happened before quota"
+ " was turned on thus quota information is probably "
+ "inconsistent. Please run quotacheck(8).\n", sb->s_id);
+ }
}
/*
@@ -978,10 +1021,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
/*
* Claim reserved quota space
*/
-static void dquot_claim_reserved_space(struct dquot *dquot,
- qsize_t number)
+static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
{
- WARN_ON(dquot->dq_dqb.dqb_rsvspace < number);
+ if (dquot->dq_dqb.dqb_rsvspace < number) {
+ WARN_ON_ONCE(1);
+ number = dquot->dq_dqb.dqb_rsvspace;
+ }
dquot->dq_dqb.dqb_curspace += number;
dquot->dq_dqb.dqb_rsvspace -= number;
}
@@ -989,7 +1034,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot,
static inline
void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
{
- dquot->dq_dqb.dqb_rsvspace -= number;
+ if (dquot->dq_dqb.dqb_rsvspace >= number)
+ dquot->dq_dqb.dqb_rsvspace -= number;
+ else {
+ WARN_ON_ONCE(1);
+ dquot->dq_dqb.dqb_rsvspace = 0;
+ }
}
static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
@@ -1131,13 +1181,13 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
*warntype = QUOTA_NL_NOWARN;
if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
test_bit(DQ_FAKE_B, &dquot->dq_flags))
- return QUOTA_OK;
+ return 0;
if (dquot->dq_dqb.dqb_ihardlimit &&
newinodes > dquot->dq_dqb.dqb_ihardlimit &&
!ignore_hardlimit(dquot)) {
*warntype = QUOTA_NL_IHARDWARN;
- return NO_QUOTA;
+ return -EDQUOT;
}
if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1146,7 +1196,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
get_seconds() >= dquot->dq_dqb.dqb_itime &&
!ignore_hardlimit(dquot)) {
*warntype = QUOTA_NL_ISOFTLONGWARN;
- return NO_QUOTA;
+ return -EDQUOT;
}
if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1157,7 +1207,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
}
- return QUOTA_OK;
+ return 0;
}
/* needs dq_data_lock */
@@ -1169,7 +1219,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
*warntype = QUOTA_NL_NOWARN;
if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
test_bit(DQ_FAKE_B, &dquot->dq_flags))
- return QUOTA_OK;
+ return 0;
tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
+ space;
@@ -1179,7 +1229,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
!ignore_hardlimit(dquot)) {
if (!prealloc)
*warntype = QUOTA_NL_BHARDWARN;
- return NO_QUOTA;
+ return -EDQUOT;
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1189,7 +1239,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
!ignore_hardlimit(dquot)) {
if (!prealloc)
*warntype = QUOTA_NL_BSOFTLONGWARN;
- return NO_QUOTA;
+ return -EDQUOT;
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1205,10 +1255,10 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
* We don't allow preallocation to exceed softlimit so exceeding will
* be always printed
*/
- return NO_QUOTA;
+ return -EDQUOT;
}
- return QUOTA_OK;
+ return 0;
}
static int info_idq_free(struct dquot *dquot, qsize_t inodes)
@@ -1242,25 +1292,32 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
return QUOTA_NL_BHARDBELOW;
return QUOTA_NL_NOWARN;
}
+
/*
- * Initialize quota pointers in inode
- * We do things in a bit complicated way but by that we avoid calling
- * dqget() and thus filesystem callbacks under dqptr_sem.
+ * Initialize quota pointers in inode
+ *
+ * We do things in a bit complicated way but by that we avoid calling
+ * dqget() and thus filesystem callbacks under dqptr_sem.
+ *
+ * It is better to call this function outside of any transaction as it
+ * might need a lot of space in journal for dquot structure allocation.
*/
-int dquot_initialize(struct inode *inode, int type)
+static void __dquot_initialize(struct inode *inode, int type)
{
unsigned int id = 0;
- int cnt, ret = 0;
- struct dquot *got[MAXQUOTAS] = { NULL, NULL };
+ int cnt;
+ struct dquot *got[MAXQUOTAS];
struct super_block *sb = inode->i_sb;
+ qsize_t rsv;
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
- if (IS_NOQUOTA(inode))
- return 0;
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+ return;
/* First get references to structures we might need. */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ got[cnt] = NULL;
if (type != -1 && cnt != type)
continue;
switch (cnt) {
@@ -1275,7 +1332,6 @@ int dquot_initialize(struct inode *inode, int type)
}
down_write(&sb_dqopt(sb)->dqptr_sem);
- /* Having dqptr_sem we know NOQUOTA flags can't be altered... */
if (IS_NOQUOTA(inode))
goto out_err;
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1287,20 +1343,31 @@ int dquot_initialize(struct inode *inode, int type)
if (!inode->i_dquot[cnt]) {
inode->i_dquot[cnt] = got[cnt];
got[cnt] = NULL;
+ /*
+ * Make quota reservation system happy if someone
+ * did a write before quota was turned on
+ */
+ rsv = inode_get_rsv_space(inode);
+ if (unlikely(rsv))
+ dquot_resv_space(inode->i_dquot[cnt], rsv);
}
}
out_err:
up_write(&sb_dqopt(sb)->dqptr_sem);
/* Drop unused references */
dqput_all(got);
- return ret;
+}
+
+void dquot_initialize(struct inode *inode)
+{
+ __dquot_initialize(inode, -1);
}
EXPORT_SYMBOL(dquot_initialize);
/*
* Release all quotas referenced by inode
*/
-int dquot_drop(struct inode *inode)
+static void __dquot_drop(struct inode *inode)
{
int cnt;
struct dquot *put[MAXQUOTAS];
@@ -1312,32 +1379,31 @@ int dquot_drop(struct inode *inode)
}
up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
dqput_all(put);
- return 0;
}
-EXPORT_SYMBOL(dquot_drop);
-/* Wrapper to remove references to quota structures from inode */
-void vfs_dq_drop(struct inode *inode)
-{
- /* Here we can get arbitrary inode from clear_inode() so we have
- * to be careful. OTOH we don't need locking as quota operations
- * are allowed to change only at mount time */
- if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op
- && inode->i_sb->dq_op->drop) {
- int cnt;
- /* Test before calling to rule out calls from proc and such
- * where we are not allowed to block. Note that this is
- * actually reliable test even without the lock - the caller
- * must assure that nobody can come after the DQUOT_DROP and
- * add quota pointers back anyway */
- for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- if (inode->i_dquot[cnt])
- break;
- if (cnt < MAXQUOTAS)
- inode->i_sb->dq_op->drop(inode);
- }
-}
-EXPORT_SYMBOL(vfs_dq_drop);
+void dquot_drop(struct inode *inode)
+{
+ int cnt;
+
+ if (IS_NOQUOTA(inode))
+ return;
+
+ /*
+ * Test before calling to rule out calls from proc and such
+ * where we are not allowed to block. Note that this is
+ * actually reliable test even without the lock - the caller
+ * must assure that nobody can come after the DQUOT_DROP and
+ * add quota pointers back anyway.
+ */
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (inode->i_dquot[cnt])
+ break;
+ }
+
+ if (cnt < MAXQUOTAS)
+ __dquot_drop(inode);
+}
+EXPORT_SYMBOL(dquot_drop);
/*
* inode_reserved_space is managed internally by quota, and protected by
@@ -1351,28 +1417,30 @@ static qsize_t *inode_reserved_space(struct inode * inode)
return inode->i_sb->dq_op->get_reserved_space(inode);
}
-static void inode_add_rsv_space(struct inode *inode, qsize_t number)
+void inode_add_rsv_space(struct inode *inode, qsize_t number)
{
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) += number;
spin_unlock(&inode->i_lock);
}
+EXPORT_SYMBOL(inode_add_rsv_space);
-
-static void inode_claim_rsv_space(struct inode *inode, qsize_t number)
+void inode_claim_rsv_space(struct inode *inode, qsize_t number)
{
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) -= number;
__inode_add_bytes(inode, number);
spin_unlock(&inode->i_lock);
}
+EXPORT_SYMBOL(inode_claim_rsv_space);
-static void inode_sub_rsv_space(struct inode *inode, qsize_t number)
+void inode_sub_rsv_space(struct inode *inode, qsize_t number)
{
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) -= number;
spin_unlock(&inode->i_lock);
}
+EXPORT_SYMBOL(inode_sub_rsv_space);
static qsize_t inode_get_rsv_space(struct inode *inode)
{
@@ -1404,38 +1472,34 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
}
/*
- * Following four functions update i_blocks+i_bytes fields and
- * quota information (together with appropriate checks)
- * NOTE: We absolutely rely on the fact that caller dirties
- * the inode (usually macros in quotaops.h care about this) and
- * holds a handle for the current transaction so that dquot write and
- * inode write go into the same transaction.
+ * This functions updates i_blocks+i_bytes fields and quota information
+ * (together with appropriate checks).
+ *
+ * NOTE: We absolutely rely on the fact that caller dirties the inode
+ * (usually helpers in quotaops.h care about this) and holds a handle for
+ * the current transaction so that dquot write and inode write go into the
+ * same transaction.
*/
/*
* This operation can block, but only after everything is updated
*/
int __dquot_alloc_space(struct inode *inode, qsize_t number,
- int warn, int reserve)
+ int warn, int reserve)
{
- int cnt, ret = QUOTA_OK;
+ int cnt, ret = 0;
char warntype[MAXQUOTAS];
/*
* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex
*/
- if (IS_NOQUOTA(inode)) {
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
inode_incr_space(inode, number, reserve);
goto out;
}
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode)) {
- inode_incr_space(inode, number, reserve);
- goto out_unlock;
- }
-
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warntype[cnt] = QUOTA_NL_NOWARN;
@@ -1443,9 +1507,9 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!inode->i_dquot[cnt])
continue;
- if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt)
- == NO_QUOTA) {
- ret = NO_QUOTA;
+ ret = check_bdq(inode->i_dquot[cnt], number, !warn,
+ warntype+cnt);
+ if (ret) {
spin_unlock(&dq_data_lock);
goto out_flush_warn;
}
@@ -1466,61 +1530,45 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
mark_all_dquot_dirty(inode->i_dquot);
out_flush_warn:
flush_warnings(inode->i_dquot, warntype);
-out_unlock:
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
out:
return ret;
}
-
-int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
-{
- return __dquot_alloc_space(inode, number, warn, 0);
-}
-EXPORT_SYMBOL(dquot_alloc_space);
-
-int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
-{
- return __dquot_alloc_space(inode, number, warn, 1);
-}
-EXPORT_SYMBOL(dquot_reserve_space);
+EXPORT_SYMBOL(__dquot_alloc_space);
/*
* This operation can block, but only after everything is updated
*/
-int dquot_alloc_inode(const struct inode *inode, qsize_t number)
+int dquot_alloc_inode(const struct inode *inode)
{
- int cnt, ret = NO_QUOTA;
+ int cnt, ret = 0;
char warntype[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
- if (IS_NOQUOTA(inode))
- return QUOTA_OK;
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+ return 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warntype[cnt] = QUOTA_NL_NOWARN;
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode)) {
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- return QUOTA_OK;
- }
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!inode->i_dquot[cnt])
continue;
- if (check_idq(inode->i_dquot[cnt], number, warntype+cnt)
- == NO_QUOTA)
+ ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt);
+ if (ret)
goto warn_put_all;
}
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!inode->i_dquot[cnt])
continue;
- dquot_incr_inodes(inode->i_dquot[cnt], number);
+ dquot_incr_inodes(inode->i_dquot[cnt], 1);
}
- ret = QUOTA_OK;
+
warn_put_all:
spin_unlock(&dq_data_lock);
- if (ret == QUOTA_OK)
+ if (ret == 0)
mark_all_dquot_dirty(inode->i_dquot);
flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1528,23 +1576,19 @@ warn_put_all:
}
EXPORT_SYMBOL(dquot_alloc_inode);
-int dquot_claim_space(struct inode *inode, qsize_t number)
+/*
+ * Convert in-memory reserved quotas to real consumed quotas
+ */
+int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
int cnt;
- int ret = QUOTA_OK;
- if (IS_NOQUOTA(inode)) {
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
inode_claim_rsv_space(inode, number);
- goto out;
+ return 0;
}
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode)) {
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- inode_claim_rsv_space(inode, number);
- goto out;
- }
-
spin_lock(&dq_data_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1557,33 +1601,26 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
spin_unlock(&dq_data_lock);
mark_all_dquot_dirty(inode->i_dquot);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-out:
- return ret;
+ return 0;
}
-EXPORT_SYMBOL(dquot_claim_space);
+EXPORT_SYMBOL(dquot_claim_space_nodirty);
/*
* This operation can block, but only after everything is updated
*/
-int __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
+void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
{
unsigned int cnt;
char warntype[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
- if (IS_NOQUOTA(inode)) {
-out_sub:
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
inode_decr_space(inode, number, reserve);
- return QUOTA_OK;
+ return;
}
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- /* Now recheck reliably when holding dqptr_sem */
- if (IS_NOQUOTA(inode)) {
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- goto out_sub;
- }
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!inode->i_dquot[cnt])
@@ -1603,56 +1640,34 @@ out_sub:
out_unlock:
flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- return QUOTA_OK;
-}
-
-int dquot_free_space(struct inode *inode, qsize_t number)
-{
- return __dquot_free_space(inode, number, 0);
}
-EXPORT_SYMBOL(dquot_free_space);
-
-/*
- * Release reserved quota space
- */
-void dquot_release_reserved_space(struct inode *inode, qsize_t number)
-{
- __dquot_free_space(inode, number, 1);
-
-}
-EXPORT_SYMBOL(dquot_release_reserved_space);
+EXPORT_SYMBOL(__dquot_free_space);
/*
* This operation can block, but only after everything is updated
*/
-int dquot_free_inode(const struct inode *inode, qsize_t number)
+void dquot_free_inode(const struct inode *inode)
{
unsigned int cnt;
char warntype[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
- if (IS_NOQUOTA(inode))
- return QUOTA_OK;
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+ return;
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- /* Now recheck reliably when holding dqptr_sem */
- if (IS_NOQUOTA(inode)) {
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- return QUOTA_OK;
- }
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!inode->i_dquot[cnt])
continue;
- warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number);
- dquot_decr_inodes(inode->i_dquot[cnt], number);
+ warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1);
+ dquot_decr_inodes(inode->i_dquot[cnt], 1);
}
spin_unlock(&dq_data_lock);
mark_all_dquot_dirty(inode->i_dquot);
flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- return QUOTA_OK;
}
EXPORT_SYMBOL(dquot_free_inode);
@@ -1662,37 +1677,31 @@ EXPORT_SYMBOL(dquot_free_inode);
* This operation can block, but only after everything is updated
* A transaction must be started when entering this function.
*/
-int dquot_transfer(struct inode *inode, struct iattr *iattr)
+static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask)
{
qsize_t space, cur_space;
qsize_t rsv_space = 0;
struct dquot *transfer_from[MAXQUOTAS];
struct dquot *transfer_to[MAXQUOTAS];
- int cnt, ret = QUOTA_OK;
- int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
- chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
+ int cnt, ret = 0;
char warntype_to[MAXQUOTAS];
char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
if (IS_NOQUOTA(inode))
- return QUOTA_OK;
+ return 0;
/* Initialize the arrays */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
transfer_from[cnt] = NULL;
transfer_to[cnt] = NULL;
warntype_to[cnt] = QUOTA_NL_NOWARN;
}
- if (chuid)
- transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid,
- USRQUOTA);
- if (chgid)
- transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid,
- GRPQUOTA);
-
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (mask & (1 << cnt))
+ transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
+ }
down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
- /* Now recheck reliably when holding dqptr_sem */
if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
goto put_all;
@@ -1706,9 +1715,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
if (!transfer_to[cnt])
continue;
transfer_from[cnt] = inode->i_dquot[cnt];
- if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
- NO_QUOTA || check_bdq(transfer_to[cnt], space, 0,
- warntype_to + cnt) == NO_QUOTA)
+ ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
+ if (ret)
+ goto over_quota;
+ ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt);
+ if (ret)
goto over_quota;
}
@@ -1762,22 +1773,32 @@ over_quota:
/* Clear dquot pointers we don't want to dqput() */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
transfer_from[cnt] = NULL;
- ret = NO_QUOTA;
goto warn_put_all;
}
-EXPORT_SYMBOL(dquot_transfer);
-/* Wrapper for transferring ownership of an inode */
-int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
+/* Wrapper for transferring ownership of an inode for uid/gid only
+ * Called from FSXXX_setattr()
+ */
+int dquot_transfer(struct inode *inode, struct iattr *iattr)
{
+ qid_t chid[MAXQUOTAS];
+ unsigned long mask = 0;
+
+ if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) {
+ mask |= 1 << USRQUOTA;
+ chid[USRQUOTA] = iattr->ia_uid;
+ }
+ if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) {
+ mask |= 1 << GRPQUOTA;
+ chid[GRPQUOTA] = iattr->ia_gid;
+ }
if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
- vfs_dq_init(inode);
- if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
- return 1;
+ dquot_initialize(inode);
+ return __dquot_transfer(inode, chid, mask);
}
return 0;
}
-EXPORT_SYMBOL(vfs_dq_transfer);
+EXPORT_SYMBOL(dquot_transfer);
/*
* Write info of quota file to disk
@@ -1798,13 +1819,6 @@ EXPORT_SYMBOL(dquot_commit_info);
* Definitions of diskquota operations.
*/
const struct dquot_operations dquot_operations = {
- .initialize = dquot_initialize,
- .drop = dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
.write_dquot = dquot_commit,
.acquire_dquot = dquot_acquire,
.release_dquot = dquot_release,
@@ -1815,6 +1829,20 @@ const struct dquot_operations dquot_operations = {
};
/*
+ * Generic helper for ->open on filesystems supporting disk quotas.
+ */
+int dquot_file_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = generic_file_open(inode, file);
+ if (!error && (file->f_mode & FMODE_WRITE))
+ dquot_initialize(inode);
+ return error;
+}
+EXPORT_SYMBOL(dquot_file_open);
+
+/*
* Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
*/
int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
@@ -1993,11 +2021,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
}
if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
- /* As we bypass the pagecache we must now flush the inode so
- * that we see all the changes from userspace... */
- write_inode_now(inode, 1);
- /* And now flush the block cache so that kernel sees the
- * changes */
+ /* As we bypass the pagecache we must now flush all the
+ * dirty data and invalidate caches so that kernel sees
+ * changes from userspace. It is not enough to just flush
+ * the quota file since if blocksize < pagesize, invalidation
+ * of the cache could fail because of other unrelated dirty
+ * data */
+ sync_filesystem(sb);
invalidate_bdev(sb->s_bdev);
}
mutex_lock(&dqopt->dqonoff_mutex);
@@ -2010,14 +2040,16 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
/* We don't want quota and atime on quota files (deadlocks
* possible) Also nobody should write to the file - we use
* special IO operations which ignore the immutable bit. */
- down_write(&dqopt->dqptr_sem);
mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
S_NOQUOTA);
inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
mutex_unlock(&inode->i_mutex);
- up_write(&dqopt->dqptr_sem);
- sb->dq_op->drop(inode);
+ /*
+ * When S_NOQUOTA is set, remove dquot references as no more
+ * references can be added
+ */
+ __dquot_drop(inode);
}
error = -EIO;
@@ -2053,14 +2085,12 @@ out_file_init:
iput(inode);
out_lock:
if (oldflags != -1) {
- down_write(&dqopt->dqptr_sem);
mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
/* Set the flags back (in the case of accidental quotaon()
* on a wrong file we don't want to mess up the flags) */
inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
inode->i_flags |= oldflags;
mutex_unlock(&inode->i_mutex);
- up_write(&dqopt->dqptr_sem);
}
mutex_unlock(&dqopt->dqonoff_mutex);
out_fmt:
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
new file mode 100644
index 00000000000..2663ed90fb0
--- /dev/null
+++ b/fs/quota/netlink.c
@@ -0,0 +1,95 @@
+
+#include <linux/cred.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/quotaops.h>
+#include <linux/sched.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+/* Netlink family structure for quota */
+static struct genl_family quota_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = "VFS_DQUOT",
+ .version = 1,
+ .maxattr = QUOTA_NL_A_MAX,
+};
+
+/**
+ * quota_send_warning - Send warning to userspace about exceeded quota
+ * @type: The quota type: USRQQUOTA, GRPQUOTA,...
+ * @id: The user or group id of the quota that was exceeded
+ * @dev: The device on which the fs is mounted (sb->s_dev)
+ * @warntype: The type of the warning: QUOTA_NL_...
+ *
+ * This can be used by filesystems (including those which don't use
+ * dquot) to send a message to userspace relating to quota limits.
+ *
+ */
+
+void quota_send_warning(short type, unsigned int id, dev_t dev,
+ const char warntype)
+{
+ static atomic_t seq;
+ struct sk_buff *skb;
+ void *msg_head;
+ int ret;
+ int msg_size = 4 * nla_total_size(sizeof(u32)) +
+ 2 * nla_total_size(sizeof(u64));
+
+ /* We have to allocate using GFP_NOFS as we are called from a
+ * filesystem performing write and thus further recursion into
+ * the fs to free some data could cause deadlocks. */
+ skb = genlmsg_new(msg_size, GFP_NOFS);
+ if (!skb) {
+ printk(KERN_ERR
+ "VFS: Not enough memory to send quota warning.\n");
+ return;
+ }
+ msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
+ &quota_genl_family, 0, QUOTA_NL_C_WARNING);
+ if (!msg_head) {
+ printk(KERN_ERR
+ "VFS: Cannot store netlink header in quota warning.\n");
+ goto err_out;
+ }
+ ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
+ if (ret)
+ goto attr_err_out;
+ genlmsg_end(skb, msg_head);
+
+ genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+ return;
+attr_err_out:
+ printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
+err_out:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL(quota_send_warning);
+
+static int __init quota_init(void)
+{
+ if (genl_register_family(&quota_genl_family) != 0)
+ printk(KERN_ERR
+ "VFS: Failed to create quota netlink interface.\n");
+ return 0;
+};
+
+module_init(quota_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ee91e275695..95388f9b735 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,7 +10,6 @@
#include <linux/slab.h>
#include <asm/current.h>
#include <asm/uaccess.h>
-#include <linux/compat.h>
#include <linux/kernel.h>
#include <linux/security.h>
#include <linux/syscalls.h>
@@ -18,220 +17,205 @@
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/types.h>
-#include <net/netlink.h>
-#include <net/genetlink.h>
+#include <linux/writeback.h>
-/* Check validity of generic quotactl commands */
-static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
- qid_t id)
+static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
+ qid_t id)
{
- if (type >= MAXQUOTAS)
- return -EINVAL;
- if (!sb && cmd != Q_SYNC)
- return -ENODEV;
- /* Is operation supported? */
- if (sb && !sb->s_qcop)
- return -ENOSYS;
-
switch (cmd) {
- case Q_GETFMT:
- break;
- case Q_QUOTAON:
- if (!sb->s_qcop->quota_on)
- return -ENOSYS;
- break;
- case Q_QUOTAOFF:
- if (!sb->s_qcop->quota_off)
- return -ENOSYS;
- break;
- case Q_SETINFO:
- if (!sb->s_qcop->set_info)
- return -ENOSYS;
- break;
- case Q_GETINFO:
- if (!sb->s_qcop->get_info)
- return -ENOSYS;
- break;
- case Q_SETQUOTA:
- if (!sb->s_qcop->set_dqblk)
- return -ENOSYS;
- break;
- case Q_GETQUOTA:
- if (!sb->s_qcop->get_dqblk)
- return -ENOSYS;
- break;
- case Q_SYNC:
- if (sb && !sb->s_qcop->quota_sync)
- return -ENOSYS;
+ /* these commands do not require any special privilegues */
+ case Q_GETFMT:
+ case Q_SYNC:
+ case Q_GETINFO:
+ case Q_XGETQSTAT:
+ case Q_XQUOTASYNC:
+ break;
+ /* allow to query information for dquots we "own" */
+ case Q_GETQUOTA:
+ case Q_XGETQUOTA:
+ if ((type == USRQUOTA && current_euid() == id) ||
+ (type == GRPQUOTA && in_egroup_p(id)))
break;
- default:
- return -EINVAL;
+ /*FALLTHROUGH*/
+ default:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
}
- /* Is quota turned on for commands which need it? */
- switch (cmd) {
- case Q_GETFMT:
- case Q_GETINFO:
- case Q_SETINFO:
- case Q_SETQUOTA:
- case Q_GETQUOTA:
- /* This is just an informative test so we are satisfied
- * without the lock */
- if (!sb_has_quota_active(sb, type))
- return -ESRCH;
- }
+ return security_quotactl(cmd, type, id, sb);
+}
- /* Check privileges */
- if (cmd == Q_GETQUOTA) {
- if (((type == USRQUOTA && current_euid() != id) ||
- (type == GRPQUOTA && !in_egroup_p(id))) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
+static int quota_sync_all(int type)
+{
+ struct super_block *sb;
+ int ret;
+
+ if (type >= MAXQUOTAS)
+ return -EINVAL;
+ ret = security_quotactl(Q_SYNC, type, 0, NULL);
+ if (ret)
+ return ret;
+
+ spin_lock(&sb_lock);
+restart:
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (!sb->s_qcop || !sb->s_qcop->quota_sync)
+ continue;
+
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ down_read(&sb->s_umount);
+ if (sb->s_root)
+ sb->s_qcop->quota_sync(sb, type, 1);
+ up_read(&sb->s_umount);
+ spin_lock(&sb_lock);
+ if (__put_super_and_need_restart(sb))
+ goto restart;
}
- else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO)
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ spin_unlock(&sb_lock);
return 0;
}
-/* Check validity of XFS Quota Manager commands */
-static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd,
- qid_t id)
+static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
+ void __user *addr)
{
- if (type >= XQM_MAXQUOTAS)
- return -EINVAL;
- if (!sb)
- return -ENODEV;
- if (!sb->s_qcop)
- return -ENOSYS;
+ char *pathname;
+ int ret = -ENOSYS;
+
+ pathname = getname(addr);
+ if (IS_ERR(pathname))
+ return PTR_ERR(pathname);
+ if (sb->s_qcop->quota_on)
+ ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
+ putname(pathname);
+ return ret;
+}
- switch (cmd) {
- case Q_XQUOTAON:
- case Q_XQUOTAOFF:
- case Q_XQUOTARM:
- if (!sb->s_qcop->set_xstate)
- return -ENOSYS;
- break;
- case Q_XGETQSTAT:
- if (!sb->s_qcop->get_xstate)
- return -ENOSYS;
- break;
- case Q_XSETQLIM:
- if (!sb->s_qcop->set_xquota)
- return -ENOSYS;
- break;
- case Q_XGETQUOTA:
- if (!sb->s_qcop->get_xquota)
- return -ENOSYS;
- break;
- case Q_XQUOTASYNC:
- if (!sb->s_qcop->quota_sync)
- return -ENOSYS;
- break;
- default:
- return -EINVAL;
- }
+static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
+{
+ __u32 fmt;
- /* Check privileges */
- if (cmd == Q_XGETQUOTA) {
- if (((type == XQM_USRQUOTA && current_euid() != id) ||
- (type == XQM_GRPQUOTA && !in_egroup_p(id))) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
- } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ down_read(&sb_dqopt(sb)->dqptr_sem);
+ if (!sb_has_quota_active(sb, type)) {
+ up_read(&sb_dqopt(sb)->dqptr_sem);
+ return -ESRCH;
}
+ fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
+ up_read(&sb_dqopt(sb)->dqptr_sem);
+ if (copy_to_user(addr, &fmt, sizeof(fmt)))
+ return -EFAULT;
+ return 0;
+}
+static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
+{
+ struct if_dqinfo info;
+ int ret;
+
+ if (!sb_has_quota_active(sb, type))
+ return -ESRCH;
+ if (!sb->s_qcop->get_info)
+ return -ENOSYS;
+ ret = sb->s_qcop->get_info(sb, type, &info);
+ if (!ret && copy_to_user(addr, &info, sizeof(info)))
+ return -EFAULT;
+ return ret;
+}
+
+static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
+{
+ struct if_dqinfo info;
+
+ if (copy_from_user(&info, addr, sizeof(info)))
+ return -EFAULT;
+ if (!sb_has_quota_active(sb, type))
+ return -ESRCH;
+ if (!sb->s_qcop->set_info)
+ return -ENOSYS;
+ return sb->s_qcop->set_info(sb, type, &info);
+}
+
+static int quota_getquota(struct super_block *sb, int type, qid_t id,
+ void __user *addr)
+{
+ struct if_dqblk idq;
+ int ret;
+
+ if (!sb_has_quota_active(sb, type))
+ return -ESRCH;
+ if (!sb->s_qcop->get_dqblk)
+ return -ENOSYS;
+ ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
+ if (ret)
+ return ret;
+ if (copy_to_user(addr, &idq, sizeof(idq)))
+ return -EFAULT;
return 0;
}
-static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
- qid_t id)
+static int quota_setquota(struct super_block *sb, int type, qid_t id,
+ void __user *addr)
{
- int error;
-
- if (XQM_COMMAND(cmd))
- error = xqm_quotactl_valid(sb, type, cmd, id);
- else
- error = generic_quotactl_valid(sb, type, cmd, id);
- if (!error)
- error = security_quotactl(cmd, type, id, sb);
- return error;
+ struct if_dqblk idq;
+
+ if (copy_from_user(&idq, addr, sizeof(idq)))
+ return -EFAULT;
+ if (!sb_has_quota_active(sb, type))
+ return -ESRCH;
+ if (!sb->s_qcop->set_dqblk)
+ return -ENOSYS;
+ return sb->s_qcop->set_dqblk(sb, type, id, &idq);
}
-#ifdef CONFIG_QUOTA
-void sync_quota_sb(struct super_block *sb, int type)
+static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
{
- int cnt;
+ __u32 flags;
- if (!sb->s_qcop->quota_sync)
- return;
+ if (copy_from_user(&flags, addr, sizeof(flags)))
+ return -EFAULT;
+ if (!sb->s_qcop->set_xstate)
+ return -ENOSYS;
+ return sb->s_qcop->set_xstate(sb, flags, cmd);
+}
- sb->s_qcop->quota_sync(sb, type);
+static int quota_getxstate(struct super_block *sb, void __user *addr)
+{
+ struct fs_quota_stat fqs;
+ int ret;
- if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
- return;
- /* This is not very clever (and fast) but currently I don't know about
- * any other simple way of getting quota data to disk and we must get
- * them there for userspace to be visible... */
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, 1);
- sync_blockdev(sb->s_bdev);
+ if (!sb->s_qcop->get_xstate)
+ return -ENOSYS;
+ ret = sb->s_qcop->get_xstate(sb, &fqs);
+ if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
+ return -EFAULT;
+ return ret;
+}
- /*
- * Now when everything is written we can discard the pagecache so
- * that userspace sees the changes.
- */
- mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (type != -1 && cnt != type)
- continue;
- if (!sb_has_quota_active(sb, cnt))
- continue;
- mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
- I_MUTEX_QUOTA);
- truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
- mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
- }
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+static int quota_setxquota(struct super_block *sb, int type, qid_t id,
+ void __user *addr)
+{
+ struct fs_disk_quota fdq;
+
+ if (copy_from_user(&fdq, addr, sizeof(fdq)))
+ return -EFAULT;
+ if (!sb->s_qcop->set_xquota)
+ return -ENOSYS;
+ return sb->s_qcop->set_xquota(sb, type, id, &fdq);
}
-#endif
-static void sync_dquots(int type)
+static int quota_getxquota(struct super_block *sb, int type, qid_t id,
+ void __user *addr)
{
- struct super_block *sb;
- int cnt;
+ struct fs_disk_quota fdq;
+ int ret;
- spin_lock(&sb_lock);
-restart:
- list_for_each_entry(sb, &super_blocks, s_list) {
- /* This test just improves performance so it needn't be
- * reliable... */
- for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (type != -1 && type != cnt)
- continue;
- if (!sb_has_quota_active(sb, cnt))
- continue;
- if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
- list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
- continue;
- break;
- }
- if (cnt == MAXQUOTAS)
- continue;
- sb->s_count++;
- spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
- if (sb->s_root)
- sync_quota_sb(sb, type);
- up_read(&sb->s_umount);
- spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto restart;
- }
- spin_unlock(&sb_lock);
+ if (!sb->s_qcop->get_xquota)
+ return -ENOSYS;
+ ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
+ if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
+ return -EFAULT;
+ return ret;
}
/* Copy parameters and call proper function */
@@ -240,117 +224,55 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
{
int ret;
+ if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS))
+ return -EINVAL;
+ if (!sb->s_qcop)
+ return -ENOSYS;
+
+ ret = check_quotactl_permission(sb, type, cmd, id);
+ if (ret < 0)
+ return ret;
+
switch (cmd) {
- case Q_QUOTAON: {
- char *pathname;
-
- pathname = getname(addr);
- if (IS_ERR(pathname))
- return PTR_ERR(pathname);
- ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
- putname(pathname);
- return ret;
- }
- case Q_QUOTAOFF:
- return sb->s_qcop->quota_off(sb, type, 0);
-
- case Q_GETFMT: {
- __u32 fmt;
-
- down_read(&sb_dqopt(sb)->dqptr_sem);
- if (!sb_has_quota_active(sb, type)) {
- up_read(&sb_dqopt(sb)->dqptr_sem);
- return -ESRCH;
- }
- fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
- up_read(&sb_dqopt(sb)->dqptr_sem);
- if (copy_to_user(addr, &fmt, sizeof(fmt)))
- return -EFAULT;
- return 0;
- }
- case Q_GETINFO: {
- struct if_dqinfo info;
-
- ret = sb->s_qcop->get_info(sb, type, &info);
- if (ret)
- return ret;
- if (copy_to_user(addr, &info, sizeof(info)))
- return -EFAULT;
- return 0;
- }
- case Q_SETINFO: {
- struct if_dqinfo info;
-
- if (copy_from_user(&info, addr, sizeof(info)))
- return -EFAULT;
- return sb->s_qcop->set_info(sb, type, &info);
- }
- case Q_GETQUOTA: {
- struct if_dqblk idq;
-
- ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
- if (ret)
- return ret;
- if (copy_to_user(addr, &idq, sizeof(idq)))
- return -EFAULT;
- return 0;
- }
- case Q_SETQUOTA: {
- struct if_dqblk idq;
-
- if (copy_from_user(&idq, addr, sizeof(idq)))
- return -EFAULT;
- return sb->s_qcop->set_dqblk(sb, type, id, &idq);
- }
- case Q_SYNC:
- if (sb)
- sync_quota_sb(sb, type);
- else
- sync_dquots(type);
- return 0;
-
- case Q_XQUOTAON:
- case Q_XQUOTAOFF:
- case Q_XQUOTARM: {
- __u32 flags;
-
- if (copy_from_user(&flags, addr, sizeof(flags)))
- return -EFAULT;
- return sb->s_qcop->set_xstate(sb, flags, cmd);
- }
- case Q_XGETQSTAT: {
- struct fs_quota_stat fqs;
-
- if ((ret = sb->s_qcop->get_xstate(sb, &fqs)))
- return ret;
- if (copy_to_user(addr, &fqs, sizeof(fqs)))
- return -EFAULT;
- return 0;
- }
- case Q_XSETQLIM: {
- struct fs_disk_quota fdq;
-
- if (copy_from_user(&fdq, addr, sizeof(fdq)))
- return -EFAULT;
- return sb->s_qcop->set_xquota(sb, type, id, &fdq);
- }
- case Q_XGETQUOTA: {
- struct fs_disk_quota fdq;
-
- ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
- if (ret)
- return ret;
- if (copy_to_user(addr, &fdq, sizeof(fdq)))
- return -EFAULT;
- return 0;
- }
- case Q_XQUOTASYNC:
- return sb->s_qcop->quota_sync(sb, type);
- /* We never reach here unless validity check is broken */
- default:
- BUG();
+ case Q_QUOTAON:
+ return quota_quotaon(sb, type, cmd, id, addr);
+ case Q_QUOTAOFF:
+ if (!sb->s_qcop->quota_off)
+ return -ENOSYS;
+ return sb->s_qcop->quota_off(sb, type, 0);
+ case Q_GETFMT:
+ return quota_getfmt(sb, type, addr);
+ case Q_GETINFO:
+ return quota_getinfo(sb, type, addr);
+ case Q_SETINFO:
+ return quota_setinfo(sb, type, addr);
+ case Q_GETQUOTA:
+ return quota_getquota(sb, type, id, addr);
+ case Q_SETQUOTA:
+ return quota_setquota(sb, type, id, addr);
+ case Q_SYNC:
+ if (!sb->s_qcop->quota_sync)
+ return -ENOSYS;
+ return sb->s_qcop->quota_sync(sb, type, 1);
+ case Q_XQUOTAON:
+ case Q_XQUOTAOFF:
+ case Q_XQUOTARM:
+ return quota_setxstate(sb, cmd, addr);
+ case Q_XGETQSTAT:
+ return quota_getxstate(sb, addr);
+ case Q_XSETQLIM:
+ return quota_setxquota(sb, type, id, addr);
+ case Q_XGETQUOTA:
+ return quota_getxquota(sb, type, id, addr);
+ case Q_XQUOTASYNC:
+ /* caller already holds s_umount */
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+ writeback_inodes_sb(sb);
+ return 0;
+ default:
+ return -EINVAL;
}
- return 0;
}
/*
@@ -397,224 +319,23 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
cmds = cmd >> SUBCMDSHIFT;
type = cmd & SUBCMDMASK;
- if (cmds != Q_SYNC || special) {
- sb = quotactl_block(special);
- if (IS_ERR(sb))
- return PTR_ERR(sb);
+ /*
+ * As a special case Q_SYNC can be called without a specific device.
+ * It will iterate all superblocks that have quota enabled and call
+ * the sync action on each of them.
+ */
+ if (!special) {
+ if (cmds == Q_SYNC)
+ return quota_sync_all(type);
+ return -ENODEV;
}
- ret = check_quotactl_valid(sb, type, cmds, id);
- if (ret >= 0)
- ret = do_quotactl(sb, type, cmds, id, addr);
- if (sb)
- drop_super(sb);
+ sb = quotactl_block(special);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
- return ret;
-}
-
-#if defined(CONFIG_COMPAT_FOR_U64_ALIGNMENT)
-/*
- * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
- * and is necessary due to alignment problems.
- */
-struct compat_if_dqblk {
- compat_u64 dqb_bhardlimit;
- compat_u64 dqb_bsoftlimit;
- compat_u64 dqb_curspace;
- compat_u64 dqb_ihardlimit;
- compat_u64 dqb_isoftlimit;
- compat_u64 dqb_curinodes;
- compat_u64 dqb_btime;
- compat_u64 dqb_itime;
- compat_uint_t dqb_valid;
-};
-
-/* XFS structures */
-struct compat_fs_qfilestat {
- compat_u64 dqb_bhardlimit;
- compat_u64 qfs_nblks;
- compat_uint_t qfs_nextents;
-};
-
-struct compat_fs_quota_stat {
- __s8 qs_version;
- __u16 qs_flags;
- __s8 qs_pad;
- struct compat_fs_qfilestat qs_uquota;
- struct compat_fs_qfilestat qs_gquota;
- compat_uint_t qs_incoredqs;
- compat_int_t qs_btimelimit;
- compat_int_t qs_itimelimit;
- compat_int_t qs_rtbtimelimit;
- __u16 qs_bwarnlimit;
- __u16 qs_iwarnlimit;
-};
-
-asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
- qid_t id, void __user *addr)
-{
- unsigned int cmds;
- struct if_dqblk __user *dqblk;
- struct compat_if_dqblk __user *compat_dqblk;
- struct fs_quota_stat __user *fsqstat;
- struct compat_fs_quota_stat __user *compat_fsqstat;
- compat_uint_t data;
- u16 xdata;
- long ret;
+ ret = do_quotactl(sb, type, cmds, id, addr);
- cmds = cmd >> SUBCMDSHIFT;
-
- switch (cmds) {
- case Q_GETQUOTA:
- dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
- compat_dqblk = addr;
- ret = sys_quotactl(cmd, special, id, dqblk);
- if (ret)
- break;
- if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
- get_user(data, &dqblk->dqb_valid) ||
- put_user(data, &compat_dqblk->dqb_valid))
- ret = -EFAULT;
- break;
- case Q_SETQUOTA:
- dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
- compat_dqblk = addr;
- ret = -EFAULT;
- if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
- get_user(data, &compat_dqblk->dqb_valid) ||
- put_user(data, &dqblk->dqb_valid))
- break;
- ret = sys_quotactl(cmd, special, id, dqblk);
- break;
- case Q_XGETQSTAT:
- fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
- compat_fsqstat = addr;
- ret = sys_quotactl(cmd, special, id, fsqstat);
- if (ret)
- break;
- ret = -EFAULT;
- /* Copying qs_version, qs_flags, qs_pad */
- if (copy_in_user(compat_fsqstat, fsqstat,
- offsetof(struct compat_fs_quota_stat, qs_uquota)))
- break;
- /* Copying qs_uquota */
- if (copy_in_user(&compat_fsqstat->qs_uquota,
- &fsqstat->qs_uquota,
- sizeof(compat_fsqstat->qs_uquota)) ||
- get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
- put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
- break;
- /* Copying qs_gquota */
- if (copy_in_user(&compat_fsqstat->qs_gquota,
- &fsqstat->qs_gquota,
- sizeof(compat_fsqstat->qs_gquota)) ||
- get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
- put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
- break;
- /* Copying the rest */
- if (copy_in_user(&compat_fsqstat->qs_incoredqs,
- &fsqstat->qs_incoredqs,
- sizeof(struct compat_fs_quota_stat) -
- offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
- get_user(xdata, &fsqstat->qs_iwarnlimit) ||
- put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
- break;
- ret = 0;
- break;
- default:
- ret = sys_quotactl(cmd, special, id, addr);
- }
+ drop_super(sb);
return ret;
}
-#endif
-
-
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-
-/* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = "VFS_DQUOT",
- .version = 1,
- .maxattr = QUOTA_NL_A_MAX,
-};
-
-/**
- * quota_send_warning - Send warning to userspace about exceeded quota
- * @type: The quota type: USRQQUOTA, GRPQUOTA,...
- * @id: The user or group id of the quota that was exceeded
- * @dev: The device on which the fs is mounted (sb->s_dev)
- * @warntype: The type of the warning: QUOTA_NL_...
- *
- * This can be used by filesystems (including those which don't use
- * dquot) to send a message to userspace relating to quota limits.
- *
- */
-
-void quota_send_warning(short type, unsigned int id, dev_t dev,
- const char warntype)
-{
- static atomic_t seq;
- struct sk_buff *skb;
- void *msg_head;
- int ret;
- int msg_size = 4 * nla_total_size(sizeof(u32)) +
- 2 * nla_total_size(sizeof(u64));
-
- /* We have to allocate using GFP_NOFS as we are called from a
- * filesystem performing write and thus further recursion into
- * the fs to free some data could cause deadlocks. */
- skb = genlmsg_new(msg_size, GFP_NOFS);
- if (!skb) {
- printk(KERN_ERR
- "VFS: Not enough memory to send quota warning.\n");
- return;
- }
- msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
- &quota_genl_family, 0, QUOTA_NL_C_WARNING);
- if (!msg_head) {
- printk(KERN_ERR
- "VFS: Cannot store netlink header in quota warning.\n");
- goto err_out;
- }
- ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
- if (ret)
- goto attr_err_out;
- ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
- if (ret)
- goto attr_err_out;
- genlmsg_end(skb, msg_head);
-
- genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
- return;
-attr_err_out:
- printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
-err_out:
- kfree_skb(skb);
-}
-EXPORT_SYMBOL(quota_send_warning);
-
-static int __init quota_init(void)
-{
- if (genl_register_family(&quota_genl_family) != 0)
- printk(KERN_ERR
- "VFS: Failed to create quota netlink interface.\n");
- return 0;
-};
-
-module_init(quota_init);
-#endif
-
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 65c87276117..dc014f7def0 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
journal_mark_dirty(th, s, sbh);
if (for_unformatted)
- vfs_dq_free_block_nodirty(inode, 1);
+ dquot_free_block_nodirty(inode, 1);
}
void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
amount_needed, hint->inode->i_uid);
#endif
quota_ret =
- vfs_dq_alloc_block_nodirty(hint->inode, amount_needed);
+ dquot_alloc_block_nodirty(hint->inode, amount_needed);
if (quota_ret) /* Quota exceeded? */
return QUOTA_EXCEEDED;
if (hint->preallocate && hint->prealloc_size) {
@@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
"reiserquota: allocating (prealloc) %d blocks id=%u",
hint->prealloc_size, hint->inode->i_uid);
#endif
- quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode,
+ quota_ret = dquot_prealloc_block_nodirty(hint->inode,
hint->prealloc_size);
if (quota_ret)
hint->preallocate = hint->prealloc_size = 0;
@@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
hint->inode->i_uid);
#endif
/* Free not allocated blocks */
- vfs_dq_free_block_nodirty(hint->inode,
+ dquot_free_block_nodirty(hint->inode,
amount_needed + hint->prealloc_size -
nr_allocated);
}
@@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
REISERFS_I(hint->inode)->i_prealloc_count,
hint->inode->i_uid);
#endif
- vfs_dq_free_block_nodirty(hint->inode, amount_needed +
+ dquot_free_block_nodirty(hint->inode, amount_needed +
hint->prealloc_size - nr_allocated -
REISERFS_I(hint->inode)->
i_prealloc_count);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index da2dba082e2..1d9c12714c5 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -289,7 +289,7 @@ const struct file_operations reiserfs_file_operations = {
.compat_ioctl = reiserfs_compat_ioctl,
#endif
.mmap = reiserfs_file_mmap,
- .open = generic_file_open,
+ .open = dquot_file_open,
.release = reiserfs_file_release,
.fsync = reiserfs_sync_file,
.aio_read = generic_file_aio_read,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0d651f980a8..d1da94b82d8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -34,6 +34,9 @@ void reiserfs_delete_inode(struct inode *inode)
int depth;
int err;
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
truncate_inode_pages(&inode->i_data, 0);
depth = reiserfs_write_lock_once(inode->i_sb);
@@ -54,7 +57,7 @@ void reiserfs_delete_inode(struct inode *inode)
* after delete_object so that quota updates go into the same transaction as
* stat data deletion */
if (!err)
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
if (journal_end(&th, inode->i_sb, jbegin_count))
goto out;
@@ -1765,10 +1768,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
BUG_ON(!th->t_trans_id);
- if (vfs_dq_alloc_inode(inode)) {
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err)
goto out_end_trans;
- }
if (!dir->i_nlink) {
err = -EPERM;
goto out_bad_inode;
@@ -1959,12 +1962,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
INODE_PKEY(inode)->k_objectid = 0;
/* Quota change must be inside a transaction for journaling */
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
out_end_trans:
journal_end(th, th->t_super, th->t_blocks_allocated);
/* Drop can be outside and it needs more credits so it's better to have it outside */
- vfs_dq_drop(inode);
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
make_bad_inode(inode);
@@ -3073,6 +3076,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
depth = reiserfs_write_lock_once(inode->i_sb);
if (attr->ia_valid & ATTR_SIZE) {
+ dquot_initialize(inode);
+
/* version 2 items will be caught by the s_maxbytes check
** done for us in vmtruncate
*/
@@ -3134,8 +3139,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
jbegin_count);
if (error)
goto out;
- error =
- vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, attr);
if (error) {
journal_end(&th, inode->i_sb,
jbegin_count);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 9d4dcf0b07c..96e4cbbfaa1 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
*/
static int drop_new_inode(struct inode *inode)
{
- vfs_dq_drop(inode);
+ dquot_drop(inode);
make_bad_inode(inode);
inode->i_flags |= S_NOQUOTA;
iput(inode);
@@ -554,7 +554,7 @@ static int drop_new_inode(struct inode *inode)
}
/* utility function that does setup for reiserfs_new_inode.
-** vfs_dq_init needs lots of credits so it's better to have it
+** dquot_initialize needs lots of credits so it's better to have it
** outside of a transaction, so we had to pull some bits of
** reiserfs_new_inode out into this func.
*/
@@ -577,7 +577,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
} else {
inode->i_gid = current_fsgid();
}
- vfs_dq_init(inode);
+ dquot_initialize(inode);
return 0;
}
@@ -594,6 +594,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
struct reiserfs_transaction_handle th;
struct reiserfs_security_handle security;
+ dquot_initialize(dir);
+
if (!(inode = new_inode(dir->i_sb))) {
return -ENOMEM;
}
@@ -666,6 +668,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
if (!new_valid_dev(rdev))
return -EINVAL;
+ dquot_initialize(dir);
+
if (!(inode = new_inode(dir->i_sb))) {
return -ENOMEM;
}
@@ -739,6 +743,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
+ dquot_initialize(dir);
+
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
/* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */
REISERFS_I(dir)->new_packing_locality = 1;
@@ -842,6 +848,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
+ dquot_initialize(dir);
+
reiserfs_write_lock(dir->i_sb);
retval = journal_begin(&th, dir->i_sb, jbegin_count);
if (retval)
@@ -923,6 +931,8 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
unsigned long savelink;
int depth;
+ dquot_initialize(dir);
+
inode = dentry->d_inode;
/* in this transaction we can be doing at max two balancings and update
@@ -1024,6 +1034,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
+ dquot_initialize(parent_dir);
+
if (!(inode = new_inode(parent_dir->i_sb))) {
return -ENOMEM;
}
@@ -1111,6 +1123,8 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
JOURNAL_PER_BALANCE_CNT * 3 +
2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
+ dquot_initialize(dir);
+
reiserfs_write_lock(dir->i_sb);
if (inode->i_nlink >= REISERFS_LINK_MAX) {
//FIXME: sd_nlink is 32 bit for new files
@@ -1235,6 +1249,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
JOURNAL_PER_BALANCE_CNT * 3 + 5 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_inode = old_dentry->d_inode;
new_dentry_inode = new_dentry->d_inode;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 5fa7118f04e..313d39d639e 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1299,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
"reiserquota delete_item(): freeing %u, id=%u type=%c",
quota_cut_bytes, inode->i_uid, head2type(&s_ih));
#endif
- vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
+ dquot_free_space_nodirty(inode, quota_cut_bytes);
/* Return deleted body length */
return ret_value;
@@ -1383,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
quota_cut_bytes, inode->i_uid,
key2type(key));
#endif
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
quota_cut_bytes);
}
break;
@@ -1733,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
"reiserquota cut_from_item(): freeing %u id=%u type=%c",
quota_cut_bytes, inode->i_uid, '?');
#endif
- vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
+ dquot_free_space_nodirty(inode, quota_cut_bytes);
return ret_value;
}
@@ -1968,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
key2type(&(key->on_disk_key)));
#endif
- if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) {
+ retval = dquot_alloc_space_nodirty(inode, pasted_size);
+ if (retval) {
pathrelse(search_path);
- return -EDQUOT;
+ return retval;
}
init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
pasted_size);
@@ -2024,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
pasted_size, inode->i_uid,
key2type(&(key->on_disk_key)));
#endif
- vfs_dq_free_space_nodirty(inode, pasted_size);
+ dquot_free_space_nodirty(inode, pasted_size);
return retval;
}
@@ -2062,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
#endif
/* We can't dirty inode here. It would be immediately written but
* appropriate stat item isn't inserted yet... */
- if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) {
+ retval = dquot_alloc_space_nodirty(inode, quota_bytes);
+ if (retval) {
pathrelse(path);
- return -EDQUOT;
+ return retval;
}
}
init_tb_struct(th, &s_ins_balance, th->t_super, path,
@@ -2113,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
quota_bytes, inode->i_uid, head2type(ih));
#endif
if (inode)
- vfs_dq_free_space_nodirty(inode, quota_bytes);
+ dquot_free_space_nodirty(inode, quota_bytes);
return retval;
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b4a7dd03bdb..04bf5d791bd 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -246,7 +246,7 @@ static int finish_unfinished(struct super_block *s)
retval = remove_save_link_only(s, &save_link_key, 0);
continue;
}
- vfs_dq_init(inode);
+ dquot_initialize(inode);
if (truncate && S_ISDIR(inode->i_mode)) {
/* We got a truncate request for a dir which is impossible.
@@ -578,6 +578,11 @@ out:
reiserfs_write_unlock_once(inode->i_sb, lock_depth);
}
+static void reiserfs_clear_inode(struct inode *inode)
+{
+ dquot_drop(inode);
+}
+
#ifdef CONFIG_QUOTA
static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
size_t, loff_t);
@@ -590,6 +595,7 @@ static const struct super_operations reiserfs_sops = {
.destroy_inode = reiserfs_destroy_inode,
.write_inode = reiserfs_write_inode,
.dirty_inode = reiserfs_dirty_inode,
+ .clear_inode = reiserfs_clear_inode,
.delete_inode = reiserfs_delete_inode,
.put_super = reiserfs_put_super,
.write_super = reiserfs_write_super,
@@ -616,13 +622,6 @@ static int reiserfs_write_info(struct super_block *, int);
static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
static const struct dquot_operations reiserfs_quota_operations = {
- .initialize = dquot_initialize,
- .drop = dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
.write_dquot = reiserfs_write_dquot,
.acquire_dquot = reiserfs_acquire_dquot,
.release_dquot = reiserfs_release_dquot,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 81f09fab8ae..37d034ca7d9 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -61,7 +61,6 @@
static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
{
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- vfs_dq_init(dir);
return dir->i_op->create(dir, dentry, mode, NULL);
}
#endif
@@ -69,7 +68,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- vfs_dq_init(dir);
return dir->i_op->mkdir(dir, dentry, mode);
}
@@ -81,7 +79,6 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
{
int error;
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- vfs_dq_init(dir);
reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
I_MUTEX_CHILD, dir->i_sb);
@@ -97,7 +94,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
{
int error;
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- vfs_dq_init(dir);
reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
I_MUTEX_CHILD, dir->i_sb);
diff --git a/fs/sync.c b/fs/sync.c
index 418727a2a23..f557d71cb09 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -34,14 +34,14 @@ static int __sync_filesystem(struct super_block *sb, int wait)
if (!sb->s_bdi)
return 0;
- /* Avoid doing twice syncing and cache pruning for quota sync */
- if (!wait) {
- writeout_quota_sb(sb, -1);
- writeback_inodes_sb(sb);
- } else {
- sync_quota_sb(sb, -1);
+ if (sb->s_qcop && sb->s_qcop->quota_sync)
+ sb->s_qcop->quota_sync(sb, -1, wait);
+
+ if (wait)
sync_inodes_sb(sb);
- }
+ else
+ writeback_inodes_sb(sb);
+
if (sb->s_op->sync_fs)
sb->s_op->sync_fs(sb, wait);
return __sync_blockdev(sb->s_bdev, wait);
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b2d96f45c12..ccc3ad7242d 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -208,7 +208,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
((char *)bh->b_data)[(bit + i) >> 3]);
} else {
if (inode)
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
udf_add_free_space(sb, sbi->s_partition, 1);
}
}
@@ -260,11 +260,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
while (bit < (sb->s_blocksize << 3) && block_count > 0) {
if (!udf_test_bit(bit, bh->b_data))
goto out;
- else if (vfs_dq_prealloc_block(inode, 1))
+ else if (dquot_prealloc_block(inode, 1))
goto out;
else if (!udf_clear_bit(bit, bh->b_data)) {
udf_debug("bit already cleared for block %d\n", bit);
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
goto out;
}
block_count--;
@@ -390,10 +390,14 @@ got_block:
/*
* Check quota for allocation of this block.
*/
- if (inode && vfs_dq_alloc_block(inode, 1)) {
- mutex_unlock(&sbi->s_alloc_mutex);
- *err = -EDQUOT;
- return 0;
+ if (inode) {
+ int ret = dquot_alloc_block(inode, 1);
+
+ if (ret) {
+ mutex_unlock(&sbi->s_alloc_mutex);
+ *err = ret;
+ return 0;
+ }
}
newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
@@ -449,7 +453,7 @@ static void udf_table_free_blocks(struct super_block *sb,
/* We do this up front - There are some error conditions that
could occure, but.. oh well */
if (inode)
- vfs_dq_free_block(inode, count);
+ dquot_free_block(inode, count);
udf_add_free_space(sb, sbi->s_partition, count);
start = bloc->logicalBlockNum + offset;
@@ -694,7 +698,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
epos.offset -= adsize;
alloc_count = (elen >> sb->s_blocksize_bits);
- if (inode && vfs_dq_prealloc_block(inode,
+ if (inode && dquot_prealloc_block(inode,
alloc_count > block_count ? block_count : alloc_count))
alloc_count = 0;
else if (alloc_count > block_count) {
@@ -797,12 +801,13 @@ static int udf_table_new_block(struct super_block *sb,
newblock = goal_eloc.logicalBlockNum;
goal_eloc.logicalBlockNum++;
goal_elen -= sb->s_blocksize;
-
- if (inode && vfs_dq_alloc_block(inode, 1)) {
- brelse(goal_epos.bh);
- mutex_unlock(&sbi->s_alloc_mutex);
- *err = -EDQUOT;
- return 0;
+ if (inode) {
+ *err = dquot_alloc_block(inode, 1);
+ if (*err) {
+ brelse(goal_epos.bh);
+ mutex_unlock(&sbi->s_alloc_mutex);
+ return 0;
+ }
}
if (goal_elen)
diff --git a/fs/udf/file.c b/fs/udf/file.c
index f311d509b6a..1eb06774ed9 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,6 +34,7 @@
#include <linux/errno.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
+#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/aio.h>
@@ -207,7 +208,7 @@ const struct file_operations udf_file_operations = {
.read = do_sync_read,
.aio_read = generic_file_aio_read,
.ioctl = udf_ioctl,
- .open = generic_file_open,
+ .open = dquot_file_open,
.mmap = generic_file_mmap,
.write = do_sync_write,
.aio_write = udf_file_aio_write,
@@ -217,6 +218,29 @@ const struct file_operations udf_file_operations = {
.llseek = generic_file_llseek,
};
+static int udf_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = dentry->d_inode;
+ int error;
+
+ error = inode_change_ok(inode, iattr);
+ if (error)
+ return error;
+
+ if (iattr->ia_valid & ATTR_SIZE)
+ dquot_initialize(inode);
+
+ if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
+ (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
+ error = dquot_transfer(inode, iattr);
+ if (error)
+ return error;
+ }
+
+ return inode_setattr(inode, iattr);
+}
+
const struct inode_operations udf_file_inode_operations = {
- .truncate = udf_truncate,
+ .truncate = udf_truncate,
+ .setattr = udf_setattr,
};
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index c10fa39f97e..fb68c9cd0c3 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode)
* Note: we must free any quota before locking the superblock,
* as writing the quota to disk may need the lock as well.
*/
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
clear_inode(inode);
@@ -61,7 +61,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
struct super_block *sb = dir->i_sb;
struct udf_sb_info *sbi = UDF_SB(sb);
struct inode *inode;
- int block;
+ int block, ret;
uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
struct udf_inode_info *iinfo;
struct udf_inode_info *dinfo = UDF_I(dir);
@@ -153,12 +153,14 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
insert_inode_hash(inode);
mark_inode_dirty(inode);
- if (vfs_dq_alloc_inode(inode)) {
- vfs_dq_drop(inode);
+ dquot_initialize(inode);
+ ret = dquot_alloc_inode(inode);
+ if (ret) {
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
iput(inode);
- *err = -EDQUOT;
+ *err = ret;
return NULL;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index b0208924729..b57ab0402d8 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,6 +36,7 @@
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
@@ -70,6 +71,9 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
void udf_delete_inode(struct inode *inode)
{
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -108,6 +112,8 @@ void udf_clear_inode(struct inode *inode)
(unsigned long long)inode->i_size,
(unsigned long long)iinfo->i_lenExtents);
}
+
+ dquot_drop(inode);
kfree(iinfo->i_ext.i_data);
iinfo->i_ext.i_data = NULL;
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 7c56ff00cd5..db423ab078b 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -563,6 +563,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
int err;
struct udf_inode_info *iinfo;
+ dquot_initialize(dir);
+
lock_kernel();
inode = udf_new_inode(dir, mode, &err);
if (!inode) {
@@ -616,6 +618,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
if (!old_valid_dev(rdev))
return -EINVAL;
+ dquot_initialize(dir);
+
lock_kernel();
err = -EIO;
inode = udf_new_inode(dir, mode, &err);
@@ -662,6 +666,8 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct udf_inode_info *dinfo = UDF_I(dir);
struct udf_inode_info *iinfo;
+ dquot_initialize(dir);
+
lock_kernel();
err = -EMLINK;
if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
@@ -799,6 +805,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
struct fileIdentDesc *fi, cfi;
struct kernel_lb_addr tloc;
+ dquot_initialize(dir);
+
retval = -ENOENT;
lock_kernel();
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -845,6 +853,8 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
struct fileIdentDesc cfi;
struct kernel_lb_addr tloc;
+ dquot_initialize(dir);
+
retval = -ENOENT;
lock_kernel();
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -899,6 +909,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
struct buffer_head *bh;
struct udf_inode_info *iinfo;
+ dquot_initialize(dir);
+
lock_kernel();
inode = udf_new_inode(dir, S_IFLNK, &err);
if (!inode)
@@ -1069,6 +1081,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
int err;
struct buffer_head *bh;
+ dquot_initialize(dir);
+
lock_kernel();
if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
unlock_kernel();
@@ -1131,6 +1145,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
struct kernel_lb_addr tloc;
struct udf_inode_info *old_iinfo = UDF_I(old_inode);
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
lock_kernel();
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
if (ofi) {
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 54c16ec95df..5cfa4d85ccf 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
"bit already cleared for fragment %u", i);
}
- vfs_dq_free_block(inode, count);
+ dquot_free_block(inode, count);
fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
@@ -195,7 +195,7 @@ do_more:
ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
ufs_clusteracct (sb, ucpi, blkno, 1);
- vfs_dq_free_block(inode, uspi->s_fpb);
+ dquot_free_block(inode, uspi->s_fpb);
fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
uspi->cs_total.cs_nbfree++;
@@ -511,6 +511,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned cgno, fragno, fragoff, count, fragsize, i;
+ int ret;
UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
(unsigned long long)fragment, oldcount, newcount);
@@ -556,8 +557,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
for (i = oldcount; i < newcount; i++)
ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
- if (vfs_dq_alloc_block(inode, count)) {
- *err = -EDQUOT;
+ ret = dquot_alloc_block(inode, count);
+ if (ret) {
+ *err = ret;
return 0;
}
@@ -596,6 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
struct ufs_cylinder_group * ucg;
unsigned oldcg, i, j, k, allocsize;
u64 result;
+ int ret;
UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -664,7 +667,7 @@ cg_found:
for (i = count; i < uspi->s_fpb; i++)
ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
i = uspi->s_fpb - count;
- vfs_dq_free_block(inode, i);
+ dquot_free_block(inode, i);
fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
uspi->cs_total.cs_nffree += i;
@@ -676,8 +679,9 @@ cg_found:
result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
if (result == INVBLOCK)
return 0;
- if (vfs_dq_alloc_block(inode, count)) {
- *err = -EDQUOT;
+ ret = dquot_alloc_block(inode, count);
+ if (ret) {
+ *err = ret;
return 0;
}
for (i = 0; i < count; i++)
@@ -714,6 +718,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
struct ufs_super_block_first * usb1;
struct ufs_cylinder_group * ucg;
u64 result, blkno;
+ int ret;
UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
@@ -747,8 +752,9 @@ gotit:
ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
ufs_clusteracct (sb, ucpi, blkno, -1);
- if (vfs_dq_alloc_block(inode, uspi->s_fpb)) {
- *err = -EDQUOT;
+ ret = dquot_alloc_block(inode, uspi->s_fpb);
+ if (ret) {
+ *err = ret;
return INVBLOCK;
}
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 73655c61240..a8962cecde5 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,6 +24,7 @@
*/
#include <linux/fs.h>
+#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -40,7 +41,7 @@ const struct file_operations ufs_file_operations = {
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.mmap = generic_file_mmap,
- .open = generic_file_open,
+ .open = dquot_file_open,
.fsync = simple_fsync,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3527c00fef0..230ecf60802 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode)
is_directory = S_ISDIR(inode->i_mode);
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
clear_inode (inode);
@@ -355,9 +355,10 @@ cg_found:
unlock_super (sb);
- if (vfs_dq_alloc_inode(inode)) {
- vfs_dq_drop(inode);
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err) {
+ dquot_drop(inode);
goto fail_without_unlock;
}
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 0a627e08610..80b68c3702d 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -37,6 +37,7 @@
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -909,6 +910,9 @@ void ufs_delete_inode (struct inode * inode)
{
loff_t old_i_size;
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 4c26d9e8bc9..118556243e7 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,6 +30,7 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/smp_lock.h>
+#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -84,6 +85,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
int err;
UFSD("BEGIN\n");
+
+ dquot_initialize(dir);
+
inode = ufs_new_inode(dir, mode);
err = PTR_ERR(inode);
@@ -107,6 +111,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
if (!old_valid_dev(rdev))
return -EINVAL;
+
+ dquot_initialize(dir);
+
inode = ufs_new_inode(dir, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
@@ -131,6 +138,8 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
if (l > sb->s_blocksize)
goto out_notlocked;
+ dquot_initialize(dir);
+
lock_kernel();
inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
err = PTR_ERR(inode);
@@ -176,6 +185,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
return -EMLINK;
}
+ dquot_initialize(dir);
+
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
atomic_inc(&inode->i_count);
@@ -193,6 +204,8 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
if (dir->i_nlink >= UFS_LINK_MAX)
goto out;
+ dquot_initialize(dir);
+
lock_kernel();
inode_inc_link_count(dir);
@@ -237,6 +250,8 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
struct page *page;
int err = -ENOENT;
+ dquot_initialize(dir);
+
de = ufs_find_entry(dir, &dentry->d_name, &page);
if (!de)
goto out;
@@ -281,6 +296,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ufs_dir_entry *old_de;
int err = -ENOENT;
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_de)
goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 143c20bfb04..66b63a75161 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1432,6 +1432,11 @@ static void destroy_inodecache(void)
kmem_cache_destroy(ufs_inode_cachep);
}
+static void ufs_clear_inode(struct inode *inode)
+{
+ dquot_drop(inode);
+}
+
#ifdef CONFIG_QUOTA
static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
@@ -1442,6 +1447,7 @@ static const struct super_operations ufs_super_ops = {
.destroy_inode = ufs_destroy_inode,
.write_inode = ufs_write_inode,
.delete_inode = ufs_delete_inode,
+ .clear_inode = ufs_clear_inode,
.put_super = ufs_put_super,
.write_super = ufs_write_super,
.sync_fs = ufs_sync_fs,
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 41dd431ce22..d3b6270cb37 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,6 +44,7 @@
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/sched.h>
+#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -517,9 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
+ if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ error = dquot_transfer(inode, attr);
+ if (error)
+ return error;
+ }
if (ia_valid & ATTR_SIZE &&
attr->ia_size != i_size_read(inode)) {
loff_t old_i_size = inode->i_size;
+
+ dquot_initialize(inode);
+
error = vmtruncate(inode, attr->ia_size);
if (error)
return error;
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 3d4a0c84d63..1947514ce1a 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -44,20 +44,6 @@ xfs_quota_type(int type)
}
STATIC int
-xfs_fs_quota_sync(
- struct super_block *sb,
- int type)
-{
- struct xfs_mount *mp = XFS_M(sb);
-
- if (sb->s_flags & MS_RDONLY)
- return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
- return -xfs_sync_data(mp, 0);
-}
-
-STATIC int
xfs_fs_get_xstate(
struct super_block *sb,
struct fs_quota_stat *fqs)
@@ -82,8 +68,6 @@ xfs_fs_set_xstate(
return -EROFS;
if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
return -ENOSYS;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
if (uflags & XFS_QUOTA_UDQ_ACCT)
flags |= XFS_UQUOTA_ACCT;
@@ -144,14 +128,11 @@ xfs_fs_set_xquota(
return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
return -ESRCH;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
}
const struct quotactl_ops xfs_quotactl_operations = {
- .quota_sync = xfs_fs_quota_sync,
.get_xstate = xfs_fs_get_xstate,
.set_xstate = xfs_fs_set_xstate,
.get_xquota = xfs_fs_get_xquota,
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index deac2566450..cac84b00666 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -202,14 +202,6 @@ static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
return flags & EXT3_OTHER_FLMASK;
}
-/*
- * Inode dynamic state flags
- */
-#define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */
-#define EXT3_STATE_NEW 0x00000002 /* inode is newly created */
-#define EXT3_STATE_XATTR 0x00000004 /* has in-inode xattrs */
-#define EXT3_STATE_FLUSH_ON_CLOSE 0x00000008
-
/* Used to pass group descriptor data when online resize is done */
struct ext3_new_group_input {
__u32 group; /* Group number for this data */
@@ -560,6 +552,31 @@ static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino)
(ino >= EXT3_FIRST_INO(sb) &&
ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count));
}
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+ EXT3_STATE_JDATA, /* journaled data exists */
+ EXT3_STATE_NEW, /* inode is newly created */
+ EXT3_STATE_XATTR, /* has in-inode xattrs */
+ EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */
+};
+
+static inline int ext3_test_inode_state(struct inode *inode, int bit)
+{
+ return test_bit(bit, &EXT3_I(inode)->i_state);
+}
+
+static inline void ext3_set_inode_state(struct inode *inode, int bit)
+{
+ set_bit(bit, &EXT3_I(inode)->i_state);
+}
+
+static inline void ext3_clear_inode_state(struct inode *inode, int bit)
+{
+ clear_bit(bit, &EXT3_I(inode)->i_state);
+}
#else
/* Assume that user mode programs are passing in an ext3fs superblock, not
* a kernel struct super_block. This will allow us to call the feature-test
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index 93e7428156b..7679acdb519 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -87,7 +87,7 @@ struct ext3_inode_info {
* near to their parent directory's inode.
*/
__u32 i_block_group;
- __u32 i_state; /* Dynamic state flags for ext3 */
+ unsigned long i_state; /* Dynamic state flags for ext3 */
/* block reservation info */
struct ext3_block_alloc_info *i_block_alloc_info;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 331530cd3cc..f3aa59cb675 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -246,19 +246,8 @@ typedef struct journal_superblock_s
#define J_ASSERT(assert) BUG_ON(!(assert))
-#if defined(CONFIG_BUFFER_DEBUG)
-void buffer_assertion_failure(struct buffer_head *bh);
-#define J_ASSERT_BH(bh, expr) \
- do { \
- if (!(expr)) \
- buffer_assertion_failure(bh); \
- J_ASSERT(expr); \
- } while (0)
-#define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr)
-#else
#define J_ASSERT_BH(bh, expr) J_ASSERT(expr)
#define J_ASSERT_JH(jh, expr) J_ASSERT(expr)
-#endif
#if defined(JBD_PARANOID_IOFAIL)
#define J_EXPECT(expr, why...) J_ASSERT(expr)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 8ada2a129d0..1ec87635818 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -277,19 +277,8 @@ typedef struct journal_superblock_s
#define J_ASSERT(assert) BUG_ON(!(assert))
-#if defined(CONFIG_BUFFER_DEBUG)
-void buffer_assertion_failure(struct buffer_head *bh);
-#define J_ASSERT_BH(bh, expr) \
- do { \
- if (!(expr)) \
- buffer_assertion_failure(bh); \
- J_ASSERT(expr); \
- } while (0)
-#define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr)
-#else
#define J_ASSERT_BH(bh, expr) J_ASSERT(expr)
#define J_ASSERT_JH(jh, expr) J_ASSERT(expr)
-#endif
#if defined(JBD2_PARANOID_IOFAIL)
#define J_EXPECT(expr, why...) J_ASSERT(expr)
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index a24de0b1858..60df9c84eca 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -103,7 +103,7 @@ struct kvm_userspace_memory_region {
/* for kvm_memory_region::flags */
#define KVM_MEM_LOG_DIRTY_PAGES 1UL
-
+#define KVM_MEMSLOT_INVALID (1UL << 1)
/* for KVM_IRQ_LINE */
struct kvm_irq_level {
@@ -497,6 +497,11 @@ struct kvm_ioeventfd {
#endif
#define KVM_CAP_S390_PSW 42
#define KVM_CAP_PPC_SEGSTATE 43
+#define KVM_CAP_HYPERV 44
+#define KVM_CAP_HYPERV_VAPIC 45
+#define KVM_CAP_HYPERV_SPIN 46
+#define KVM_CAP_PCI_SEGMENT 47
+#define KVM_CAP_X86_ROBUST_SINGLESTEP 51
#ifdef KVM_CAP_IRQ_ROUTING
@@ -691,8 +696,9 @@ struct kvm_assigned_pci_dev {
__u32 busnr;
__u32 devfn;
__u32 flags;
+ __u32 segnr;
union {
- __u32 reserved[12];
+ __u32 reserved[11];
};
};
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bd5a616d937..a3fd0f91d94 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -38,6 +38,7 @@
#define KVM_REQ_MMU_SYNC 7
#define KVM_REQ_KVMCLOCK_UPDATE 8
#define KVM_REQ_KICK 9
+#define KVM_REQ_DEACTIVATE_FPU 10
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
@@ -57,20 +58,20 @@ struct kvm_io_bus {
struct kvm_io_device *devs[NR_IOBUS_DEVS];
};
-void kvm_io_bus_init(struct kvm_io_bus *bus);
-void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, int len,
- const void *val);
-int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len,
+enum kvm_bus {
+ KVM_MMIO_BUS,
+ KVM_PIO_BUS,
+ KVM_NR_BUSES
+};
+
+int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+ int len, const void *val);
+int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
void *val);
-int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
- struct kvm_io_device *dev);
-int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev);
-void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
- struct kvm_io_device *dev);
-void kvm_io_bus_unregister_dev(struct kvm *kvm, struct kvm_io_bus *bus,
- struct kvm_io_device *dev);
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev);
struct kvm_vcpu {
struct kvm *kvm;
@@ -83,6 +84,8 @@ struct kvm_vcpu {
struct kvm_run *run;
unsigned long requests;
unsigned long guest_debug;
+ int srcu_idx;
+
int fpu_active;
int guest_fpu_loaded;
wait_queue_head_t wq;
@@ -150,14 +153,19 @@ struct kvm_irq_routing_table {};
#endif
-struct kvm {
- spinlock_t mmu_lock;
- spinlock_t requests_lock;
- struct rw_semaphore slots_lock;
- struct mm_struct *mm; /* userspace tied to this vm */
+struct kvm_memslots {
int nmemslots;
struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
KVM_PRIVATE_MEM_SLOTS];
+};
+
+struct kvm {
+ spinlock_t mmu_lock;
+ raw_spinlock_t requests_lock;
+ struct mutex slots_lock;
+ struct mm_struct *mm; /* userspace tied to this vm */
+ struct kvm_memslots *memslots;
+ struct srcu_struct srcu;
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
u32 bsp_vcpu_id;
struct kvm_vcpu *bsp_vcpu;
@@ -166,8 +174,7 @@ struct kvm {
atomic_t online_vcpus;
struct list_head vm_list;
struct mutex lock;
- struct kvm_io_bus mmio_bus;
- struct kvm_io_bus pio_bus;
+ struct kvm_io_bus *buses[KVM_NR_BUSES];
#ifdef CONFIG_HAVE_KVM_EVENTFD
struct {
spinlock_t lock;
@@ -249,13 +256,20 @@ int kvm_set_memory_region(struct kvm *kvm,
int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc);
-int kvm_arch_set_memory_region(struct kvm *kvm,
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc);
+void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
struct kvm_memory_slot old,
int user_alloc);
void kvm_disable_largepages(void);
void kvm_arch_flush_shadow(struct kvm *kvm);
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
+gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn);
+
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
@@ -264,6 +278,9 @@ void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
+pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn);
+int memslot_id(struct kvm *kvm, gfn_t gfn);
void kvm_release_pfn_dirty(pfn_t);
void kvm_release_pfn_clean(pfn_t pfn);
void kvm_set_pfn_dirty(pfn_t pfn);
@@ -283,6 +300,7 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
+unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
void kvm_vcpu_block(struct kvm_vcpu *vcpu);
@@ -383,6 +401,7 @@ struct kvm_assigned_dev_kernel {
struct work_struct interrupt_work;
struct list_head list;
int assigned_dev_id;
+ int host_segnr;
int host_busnr;
int host_devfn;
unsigned int entries_nr;
@@ -429,8 +448,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
#define KVM_IOMMU_CACHE_COHERENCY 0x1
#ifdef CONFIG_IOMMU_API
-int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
- unsigned long npages);
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
int kvm_iommu_map_guest(struct kvm *kvm);
int kvm_iommu_unmap_guest(struct kvm *kvm);
int kvm_assign_device(struct kvm *kvm,
@@ -480,11 +498,6 @@ static inline void kvm_guest_exit(void)
current->flags &= ~PF_VCPU;
}
-static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
- return slot - kvm->memslots;
-}
-
static inline gpa_t gfn_to_gpa(gfn_t gfn)
{
return (gpa_t)gfn << PAGE_SHIFT;
@@ -532,6 +545,10 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
}
#endif
+#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION
+#define unalias_gfn_instantiation unalias_gfn
+#endif
+
#ifdef CONFIG_HAVE_KVM_IRQCHIP
#define KVM_MAX_IRQ_ROUTES 1024
diff --git a/include/linux/quota.h b/include/linux/quota.h
index a6861f11748..b462916b2a0 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -279,9 +279,6 @@ struct dquot {
struct mem_dqblk dq_dqb; /* Diskquota usage */
};
-#define QUOTA_OK 0
-#define NO_QUOTA 1
-
/* Operations which must be implemented by each quota format */
struct quota_format_ops {
int (*check_quota_file)(struct super_block *sb, int type); /* Detect whether file is in our format */
@@ -295,13 +292,6 @@ struct quota_format_ops {
/* Operations working with dquots */
struct dquot_operations {
- int (*initialize) (struct inode *, int);
- int (*drop) (struct inode *);
- int (*alloc_space) (struct inode *, qsize_t, int);
- int (*alloc_inode) (const struct inode *, qsize_t);
- int (*free_space) (struct inode *, qsize_t);
- int (*free_inode) (const struct inode *, qsize_t);
- int (*transfer) (struct inode *, struct iattr *);
int (*write_dquot) (struct dquot *); /* Ordinary dquot write */
struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */
void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */
@@ -309,12 +299,6 @@ struct dquot_operations {
int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */
int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */
int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */
- /* reserve quota for delayed block allocation */
- int (*reserve_space) (struct inode *, qsize_t, int);
- /* claim reserved quota for delayed alloc */
- int (*claim_space) (struct inode *, qsize_t);
- /* release rsved quota for delayed alloc */
- void (*release_rsv) (struct inode *, qsize_t);
/* get reserved quota for delayed alloc, value returned is managed by
* quota code only */
qsize_t *(*get_reserved_space) (struct inode *);
@@ -324,7 +308,7 @@ struct dquot_operations {
struct quotactl_ops {
int (*quota_on)(struct super_block *, int, int, char *, int);
int (*quota_off)(struct super_block *, int, int);
- int (*quota_sync)(struct super_block *, int);
+ int (*quota_sync)(struct super_block *, int, int);
int (*get_info)(struct super_block *, int, struct if_dqinfo *);
int (*set_info)(struct super_block *, int, struct if_dqinfo *);
int (*get_dqblk)(struct super_block *, int, qid_t, struct if_dqblk *);
@@ -357,26 +341,25 @@ enum {
#define DQUOT_STATE_FLAGS (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED | \
DQUOT_SUSPENDED)
/* Other quota flags */
-#define DQUOT_QUOTA_SYS_FILE (1 << 6) /* Quota file is a special
+#define DQUOT_STATE_LAST (_DQUOT_STATE_FLAGS * MAXQUOTAS)
+#define DQUOT_QUOTA_SYS_FILE (1 << DQUOT_STATE_LAST)
+ /* Quota file is a special
* system file and user cannot
* touch it. Filesystem is
* responsible for setting
* S_NOQUOTA, S_NOATIME flags
*/
-#define DQUOT_NEGATIVE_USAGE (1 << 7) /* Allow negative quota usage */
+#define DQUOT_NEGATIVE_USAGE (1 << (DQUOT_STATE_LAST + 1))
+ /* Allow negative quota usage */
static inline unsigned int dquot_state_flag(unsigned int flags, int type)
{
- if (type == USRQUOTA)
- return flags;
- return flags << _DQUOT_STATE_FLAGS;
+ return flags << _DQUOT_STATE_FLAGS * type;
}
static inline unsigned int dquot_generic_flag(unsigned int flags, int type)
{
- if (type == USRQUOTA)
- return flags;
- return flags >> _DQUOT_STATE_FLAGS;
+ return (flags >> _DQUOT_STATE_FLAGS * type) & DQUOT_STATE_FLAGS;
}
#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 3ebb2315364..e6fa7acce29 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -19,15 +19,12 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb)
/*
* declaration of quota_function calls in kernel.
*/
-void sync_quota_sb(struct super_block *sb, int type);
-static inline void writeout_quota_sb(struct super_block *sb, int type)
-{
- if (sb->s_qcop->quota_sync)
- sb->s_qcop->quota_sync(sb, type);
-}
+void inode_add_rsv_space(struct inode *inode, qsize_t number);
+void inode_claim_rsv_space(struct inode *inode, qsize_t number);
+void inode_sub_rsv_space(struct inode *inode, qsize_t number);
-int dquot_initialize(struct inode *inode, int type);
-int dquot_drop(struct inode *inode);
+void dquot_initialize(struct inode *inode);
+void dquot_drop(struct inode *inode);
struct dquot *dqget(struct super_block *sb, unsigned int id, int type);
void dqput(struct dquot *dquot);
int dquot_scan_active(struct super_block *sb,
@@ -36,24 +33,23 @@ int dquot_scan_active(struct super_block *sb,
struct dquot *dquot_alloc(struct super_block *sb, int type);
void dquot_destroy(struct dquot *dquot);
-int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc);
-int dquot_alloc_inode(const struct inode *inode, qsize_t number);
+int __dquot_alloc_space(struct inode *inode, qsize_t number,
+ int warn, int reserve);
+void __dquot_free_space(struct inode *inode, qsize_t number, int reserve);
-int dquot_reserve_space(struct inode *inode, qsize_t number, int prealloc);
-int dquot_claim_space(struct inode *inode, qsize_t number);
-void dquot_release_reserved_space(struct inode *inode, qsize_t number);
-qsize_t dquot_get_reserved_space(struct inode *inode);
+int dquot_alloc_inode(const struct inode *inode);
-int dquot_free_space(struct inode *inode, qsize_t number);
-int dquot_free_inode(const struct inode *inode, qsize_t number);
+int dquot_claim_space_nodirty(struct inode *inode, qsize_t number);
+void dquot_free_inode(const struct inode *inode);
-int dquot_transfer(struct inode *inode, struct iattr *iattr);
int dquot_commit(struct dquot *dquot);
int dquot_acquire(struct dquot *dquot);
int dquot_release(struct dquot *dquot);
int dquot_commit_info(struct super_block *sb, int type);
int dquot_mark_dquot_dirty(struct dquot *dquot);
+int dquot_file_open(struct inode *inode, struct file *file);
+
int vfs_quota_on(struct super_block *sb, int type, int format_id,
char *path, int remount);
int vfs_quota_enable(struct inode *inode, int type, int format_id,
@@ -64,14 +60,13 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
int format_id, int type);
int vfs_quota_off(struct super_block *sb, int type, int remount);
int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags);
-int vfs_quota_sync(struct super_block *sb, int type);
+int vfs_quota_sync(struct super_block *sb, int type, int wait);
int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii);
int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii);
int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di);
int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di);
-void vfs_dq_drop(struct inode *inode);
-int vfs_dq_transfer(struct inode *inode, struct iattr *iattr);
+int dquot_transfer(struct inode *inode, struct iattr *iattr);
int vfs_dq_quota_on_remount(struct super_block *sb);
static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
@@ -83,53 +78,56 @@ static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
* Functions for checking status of quota
*/
-static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type)
+static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type)
{
return sb_dqopt(sb)->flags &
dquot_state_flag(DQUOT_USAGE_ENABLED, type);
}
-static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type)
+static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type)
{
return sb_dqopt(sb)->flags &
dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
}
-static inline int sb_has_quota_suspended(struct super_block *sb, int type)
+static inline bool sb_has_quota_suspended(struct super_block *sb, int type)
{
return sb_dqopt(sb)->flags &
dquot_state_flag(DQUOT_SUSPENDED, type);
}
-static inline int sb_any_quota_suspended(struct super_block *sb)
+static inline unsigned sb_any_quota_suspended(struct super_block *sb)
{
- return sb_has_quota_suspended(sb, USRQUOTA) ||
- sb_has_quota_suspended(sb, GRPQUOTA);
+ unsigned type, tmsk = 0;
+ for (type = 0; type < MAXQUOTAS; type++)
+ tmsk |= sb_has_quota_suspended(sb, type) << type;
+ return tmsk;
}
/* Does kernel know about any quota information for given sb + type? */
-static inline int sb_has_quota_loaded(struct super_block *sb, int type)
+static inline bool sb_has_quota_loaded(struct super_block *sb, int type)
{
/* Currently if anything is on, then quota usage is on as well */
return sb_has_quota_usage_enabled(sb, type);
}
-static inline int sb_any_quota_loaded(struct super_block *sb)
+static inline unsigned sb_any_quota_loaded(struct super_block *sb)
{
- return sb_has_quota_loaded(sb, USRQUOTA) ||
- sb_has_quota_loaded(sb, GRPQUOTA);
+ unsigned type, tmsk = 0;
+ for (type = 0; type < MAXQUOTAS; type++)
+ tmsk |= sb_has_quota_loaded(sb, type) << type;
+ return tmsk;
}
-static inline int sb_has_quota_active(struct super_block *sb, int type)
+static inline bool sb_has_quota_active(struct super_block *sb, int type)
{
return sb_has_quota_loaded(sb, type) &&
!sb_has_quota_suspended(sb, type);
}
-static inline int sb_any_quota_active(struct super_block *sb)
+static inline unsigned sb_any_quota_active(struct super_block *sb)
{
- return sb_has_quota_active(sb, USRQUOTA) ||
- sb_has_quota_active(sb, GRPQUOTA);
+ return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
}
/*
@@ -141,122 +139,6 @@ extern const struct quotactl_ops vfs_quotactl_ops;
#define sb_dquot_ops (&dquot_operations)
#define sb_quotactl_ops (&vfs_quotactl_ops)
-/* It is better to call this function outside of any transaction as it might
- * need a lot of space in journal for dquot structure allocation. */
-static inline void vfs_dq_init(struct inode *inode)
-{
- BUG_ON(!inode->i_sb);
- if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode))
- inode->i_sb->dq_op->initialize(inode, -1);
-}
-
-/* The following allocation/freeing/transfer functions *must* be called inside
- * a transaction (deadlocks possible otherwise) */
-static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr)
-{
- if (sb_any_quota_active(inode->i_sb)) {
- /* Used space is updated in alloc_space() */
- if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA)
- return 1;
- }
- else
- inode_add_bytes(inode, nr);
- return 0;
-}
-
-static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr)
-{
- int ret;
- if (!(ret = vfs_dq_prealloc_space_nodirty(inode, nr)))
- mark_inode_dirty(inode);
- return ret;
-}
-
-static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr)
-{
- if (sb_any_quota_active(inode->i_sb)) {
- /* Used space is updated in alloc_space() */
- if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA)
- return 1;
- }
- else
- inode_add_bytes(inode, nr);
- return 0;
-}
-
-static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr)
-{
- int ret;
- if (!(ret = vfs_dq_alloc_space_nodirty(inode, nr)))
- mark_inode_dirty(inode);
- return ret;
-}
-
-static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr)
-{
- if (sb_any_quota_active(inode->i_sb)) {
- /* Used space is updated in alloc_space() */
- if (inode->i_sb->dq_op->reserve_space(inode, nr, 0) == NO_QUOTA)
- return 1;
- }
- return 0;
-}
-
-static inline int vfs_dq_alloc_inode(struct inode *inode)
-{
- if (sb_any_quota_active(inode->i_sb)) {
- vfs_dq_init(inode);
- if (inode->i_sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA)
- return 1;
- }
- return 0;
-}
-
-/*
- * Convert in-memory reserved quotas to real consumed quotas
- */
-static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr)
-{
- if (sb_any_quota_active(inode->i_sb)) {
- if (inode->i_sb->dq_op->claim_space(inode, nr) == NO_QUOTA)
- return 1;
- } else
- inode_add_bytes(inode, nr);
-
- mark_inode_dirty(inode);
- return 0;
-}
-
-/*
- * Release reserved (in-memory) quotas
- */
-static inline
-void vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr)
-{
- if (sb_any_quota_active(inode->i_sb))
- inode->i_sb->dq_op->release_rsv(inode, nr);
-}
-
-static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr)
-{
- if (sb_any_quota_active(inode->i_sb))
- inode->i_sb->dq_op->free_space(inode, nr);
- else
- inode_sub_bytes(inode, nr);
-}
-
-static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr)
-{
- vfs_dq_free_space_nodirty(inode, nr);
- mark_inode_dirty(inode);
-}
-
-static inline void vfs_dq_free_inode(struct inode *inode)
-{
- if (sb_any_quota_active(inode->i_sb))
- inode->i_sb->dq_op->free_inode(inode, 1);
-}
-
/* Cannot be called inside a transaction */
static inline int vfs_dq_off(struct super_block *sb, int remount)
{
@@ -316,28 +198,20 @@ static inline int sb_any_quota_active(struct super_block *sb)
#define sb_dquot_ops (NULL)
#define sb_quotactl_ops (NULL)
-static inline void vfs_dq_init(struct inode *inode)
+static inline void dquot_initialize(struct inode *inode)
{
}
-static inline void vfs_dq_drop(struct inode *inode)
+static inline void dquot_drop(struct inode *inode)
{
}
-static inline int vfs_dq_alloc_inode(struct inode *inode)
+static inline int dquot_alloc_inode(const struct inode *inode)
{
return 0;
}
-static inline void vfs_dq_free_inode(struct inode *inode)
-{
-}
-
-static inline void sync_quota_sb(struct super_block *sb, int type)
-{
-}
-
-static inline void writeout_quota_sb(struct super_block *sb, int type)
+static inline void dquot_free_inode(const struct inode *inode)
{
}
@@ -351,110 +225,116 @@ static inline int vfs_dq_quota_on_remount(struct super_block *sb)
return 0;
}
-static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
+static inline int dquot_transfer(struct inode *inode, struct iattr *iattr)
{
return 0;
}
-static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr)
+static inline int __dquot_alloc_space(struct inode *inode, qsize_t number,
+ int warn, int reserve)
{
- inode_add_bytes(inode, nr);
+ if (!reserve)
+ inode_add_bytes(inode, number);
return 0;
}
-static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr)
+static inline void __dquot_free_space(struct inode *inode, qsize_t number,
+ int reserve)
{
- vfs_dq_prealloc_space_nodirty(inode, nr);
- mark_inode_dirty(inode);
- return 0;
+ if (!reserve)
+ inode_sub_bytes(inode, number);
}
-static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr)
+static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
- inode_add_bytes(inode, nr);
+ inode_add_bytes(inode, number);
return 0;
}
-static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr)
+#define dquot_file_open generic_file_open
+
+#endif /* CONFIG_QUOTA */
+
+static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr)
{
- vfs_dq_alloc_space_nodirty(inode, nr);
- mark_inode_dirty(inode);
- return 0;
+ return __dquot_alloc_space(inode, nr, 1, 0);
}
-static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr)
+static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
{
- return 0;
+ int ret;
+
+ ret = dquot_alloc_space_nodirty(inode, nr);
+ if (!ret)
+ mark_inode_dirty(inode);
+ return ret;
}
-static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr)
+static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr)
{
- return vfs_dq_alloc_space(inode, nr);
+ return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits);
}
-static inline
-int vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr)
+static inline int dquot_alloc_block(struct inode *inode, qsize_t nr)
{
- return 0;
+ return dquot_alloc_space(inode, nr << inode->i_blkbits);
}
-static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr)
+static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr)
{
- inode_sub_bytes(inode, nr);
+ return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0, 0);
}
-static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr)
+static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr)
{
- vfs_dq_free_space_nodirty(inode, nr);
- mark_inode_dirty(inode);
-}
-
-#endif /* CONFIG_QUOTA */
+ int ret;
-static inline int vfs_dq_prealloc_block_nodirty(struct inode *inode, qsize_t nr)
-{
- return vfs_dq_prealloc_space_nodirty(inode, nr << inode->i_blkbits);
+ ret = dquot_prealloc_block_nodirty(inode, nr);
+ if (!ret)
+ mark_inode_dirty(inode);
+ return ret;
}
-static inline int vfs_dq_prealloc_block(struct inode *inode, qsize_t nr)
+static inline int dquot_reserve_block(struct inode *inode, qsize_t nr)
{
- return vfs_dq_prealloc_space(inode, nr << inode->i_blkbits);
+ return __dquot_alloc_space(inode, nr << inode->i_blkbits, 1, 1);
}
-static inline int vfs_dq_alloc_block_nodirty(struct inode *inode, qsize_t nr)
+static inline int dquot_claim_block(struct inode *inode, qsize_t nr)
{
- return vfs_dq_alloc_space_nodirty(inode, nr << inode->i_blkbits);
-}
+ int ret;
-static inline int vfs_dq_alloc_block(struct inode *inode, qsize_t nr)
-{
- return vfs_dq_alloc_space(inode, nr << inode->i_blkbits);
+ ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits);
+ if (!ret)
+ mark_inode_dirty(inode);
+ return ret;
}
-static inline int vfs_dq_reserve_block(struct inode *inode, qsize_t nr)
+static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr)
{
- return vfs_dq_reserve_space(inode, nr << inode->i_blkbits);
+ __dquot_free_space(inode, nr, 0);
}
-static inline int vfs_dq_claim_block(struct inode *inode, qsize_t nr)
+static inline void dquot_free_space(struct inode *inode, qsize_t nr)
{
- return vfs_dq_claim_space(inode, nr << inode->i_blkbits);
+ dquot_free_space_nodirty(inode, nr);
+ mark_inode_dirty(inode);
}
-static inline
-void vfs_dq_release_reservation_block(struct inode *inode, qsize_t nr)
+static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr)
{
- vfs_dq_release_reservation_space(inode, nr << inode->i_blkbits);
+ dquot_free_space_nodirty(inode, nr << inode->i_blkbits);
}
-static inline void vfs_dq_free_block_nodirty(struct inode *inode, qsize_t nr)
+static inline void dquot_free_block(struct inode *inode, qsize_t nr)
{
- vfs_dq_free_space_nodirty(inode, nr << inode->i_blkbits);
+ dquot_free_space(inode, nr << inode->i_blkbits);
}
-static inline void vfs_dq_free_block(struct inode *inode, qsize_t nr)
+static inline void dquot_release_reservation_block(struct inode *inode,
+ qsize_t nr)
{
- vfs_dq_free_space(inode, nr << inode->i_blkbits);
+ __dquot_free_space(inode, nr << inode->i_blkbits, 1);
}
#endif /* _LINUX_QUOTAOPS_ */
diff --git a/include/linux/virtio_9p.h b/include/linux/virtio_9p.h
index 095e10d148b..33227508008 100644
--- a/include/linux/virtio_9p.h
+++ b/include/linux/virtio_9p.h
@@ -5,7 +5,4 @@
#include <linux/virtio_ids.h>
#include <linux/virtio_config.h>
-/* Maximum number of virtio channels per partition (1 for now) */
-#define MAX_9P_CHAN 1
-
#endif /* _LINUX_VIRTIO_9P_H */
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index fb00b329f0d..52e1fff709e 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -29,6 +29,19 @@
/* Number of requests per row */
#define P9_ROW_MAXTAG 255
+/** enum p9_proto_versions - 9P protocol versions
+ * @p9_proto_legacy: 9P Legacy mode, pre-9P2000.u
+ * @p9_proto_2000u: 9P2000.u extension
+ * @p9_proto_2010L: 9P2010.L extension
+ */
+
+enum p9_proto_versions{
+ p9_proto_legacy = 0,
+ p9_proto_2000u = 1,
+ p9_proto_2010L = 2,
+};
+
+
/**
* enum p9_trans_status - different states of underlying transports
* @Connected: transport is connected and healthy
@@ -111,6 +124,7 @@ struct p9_req_t {
* @lock: protect @fidlist
* @msize: maximum data size negotiated by protocol
* @dotu: extension flags negotiated by protocol
+ * @proto_version: 9P protocol version to use
* @trans_mod: module API instantiated with this client
* @trans: tranport instance state and API
* @conn: connection state information used by trans_fd
@@ -137,7 +151,7 @@ struct p9_req_t {
struct p9_client {
spinlock_t lock; /* protect client structure */
int msize;
- unsigned char dotu;
+ unsigned char proto_version;
struct p9_trans_module *trans_mod;
enum p9_trans_status status;
void *trans;
@@ -209,5 +223,7 @@ int p9_parse_header(struct p9_fcall *, int32_t *, int8_t *, int16_t *, int);
int p9stat_read(char *, int, struct p9_wstat *, int);
void p9stat_free(struct p9_wstat *);
+int p9_is_proto_dotu(struct p9_client *clnt);
+int p9_is_proto_dotl(struct p9_client *clnt);
#endif /* NET_9P_CLIENT_H */
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index dbe10845527..b17d49dfc3e 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -145,6 +145,47 @@ TRACE_EVENT(kvm_mmio,
__entry->len, __entry->gpa, __entry->val)
);
+#define kvm_fpu_load_symbol \
+ {0, "unload"}, \
+ {1, "load"}
+
+TRACE_EVENT(kvm_fpu,
+ TP_PROTO(int load),
+ TP_ARGS(load),
+
+ TP_STRUCT__entry(
+ __field( u32, load )
+ ),
+
+ TP_fast_assign(
+ __entry->load = load;
+ ),
+
+ TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol))
+);
+
+TRACE_EVENT(kvm_age_page,
+ TP_PROTO(ulong hva, struct kvm_memory_slot *slot, int ref),
+ TP_ARGS(hva, slot, ref),
+
+ TP_STRUCT__entry(
+ __field( u64, hva )
+ __field( u64, gfn )
+ __field( u8, referenced )
+ ),
+
+ TP_fast_assign(
+ __entry->hva = hva;
+ __entry->gfn =
+ slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT);
+ __entry->referenced = ref;
+ ),
+
+ TP_printk("hva %llx gfn %llx %s",
+ __entry->hva, __entry->gfn,
+ __entry->referenced ? "YOUNG" : "OLD")
+);
+
#endif /* _TRACE_KVM_MAIN_H */
/* This part must be outside protection */
diff --git a/net/9p/client.c b/net/9p/client.c
index 09d4f1e2e4a..bde9f3d38c5 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -46,6 +46,7 @@ enum {
Opt_msize,
Opt_trans,
Opt_legacy,
+ Opt_version,
Opt_err,
};
@@ -53,9 +54,42 @@ static const match_table_t tokens = {
{Opt_msize, "msize=%u"},
{Opt_legacy, "noextend"},
{Opt_trans, "trans=%s"},
+ {Opt_version, "version=%s"},
{Opt_err, NULL},
};
+inline int p9_is_proto_dotl(struct p9_client *clnt)
+{
+ return (clnt->proto_version == p9_proto_2010L);
+}
+EXPORT_SYMBOL(p9_is_proto_dotl);
+
+inline int p9_is_proto_dotu(struct p9_client *clnt)
+{
+ return (clnt->proto_version == p9_proto_2000u);
+}
+EXPORT_SYMBOL(p9_is_proto_dotu);
+
+/* Interpret mount option for protocol version */
+static unsigned char get_protocol_version(const substring_t *name)
+{
+ unsigned char version = -EINVAL;
+ if (!strncmp("9p2000", name->from, name->to-name->from)) {
+ version = p9_proto_legacy;
+ P9_DPRINTK(P9_DEBUG_9P, "Protocol version: Legacy\n");
+ } else if (!strncmp("9p2000.u", name->from, name->to-name->from)) {
+ version = p9_proto_2000u;
+ P9_DPRINTK(P9_DEBUG_9P, "Protocol version: 9P2000.u\n");
+ } else if (!strncmp("9p2010.L", name->from, name->to-name->from)) {
+ version = p9_proto_2010L;
+ P9_DPRINTK(P9_DEBUG_9P, "Protocol version: 9P2010.L\n");
+ } else {
+ P9_DPRINTK(P9_DEBUG_ERROR, "Unknown protocol version %s. ",
+ name->from);
+ }
+ return version;
+}
+
static struct p9_req_t *
p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
@@ -75,7 +109,7 @@ static int parse_opts(char *opts, struct p9_client *clnt)
int option;
int ret = 0;
- clnt->dotu = 1;
+ clnt->proto_version = p9_proto_2000u;
clnt->msize = 8192;
if (!opts)
@@ -118,7 +152,13 @@ static int parse_opts(char *opts, struct p9_client *clnt)
}
break;
case Opt_legacy:
- clnt->dotu = 0;
+ clnt->proto_version = p9_proto_legacy;
+ break;
+ case Opt_version:
+ ret = get_protocol_version(&args[0]);
+ if (ret == -EINVAL)
+ goto free_and_return;
+ clnt->proto_version = ret;
break;
default:
continue;
@@ -410,14 +450,15 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
int ecode;
char *ename;
- err = p9pdu_readf(req->rc, c->dotu, "s?d", &ename, &ecode);
+ err = p9pdu_readf(req->rc, c->proto_version, "s?d",
+ &ename, &ecode);
if (err) {
P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n",
err);
return err;
}
- if (c->dotu)
+ if (p9_is_proto_dotu(c))
err = -ecode;
if (!err || !IS_ERR_VALUE(err))
@@ -515,7 +556,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
/* marshall the data */
p9pdu_prepare(req->tc, tag, type);
va_start(ap, fmt);
- err = p9pdu_vwritef(req->tc, c->dotu, fmt, ap);
+ err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap);
va_end(ap);
p9pdu_finalize(req->tc);
@@ -627,14 +668,31 @@ int p9_client_version(struct p9_client *c)
char *version;
int msize;
- P9_DPRINTK(P9_DEBUG_9P, ">>> TVERSION msize %d extended %d\n",
- c->msize, c->dotu);
- req = p9_client_rpc(c, P9_TVERSION, "ds", c->msize,
- c->dotu ? "9P2000.u" : "9P2000");
+ P9_DPRINTK(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n",
+ c->msize, c->proto_version);
+
+ switch (c->proto_version) {
+ case p9_proto_2010L:
+ req = p9_client_rpc(c, P9_TVERSION, "ds",
+ c->msize, "9P2010.L");
+ break;
+ case p9_proto_2000u:
+ req = p9_client_rpc(c, P9_TVERSION, "ds",
+ c->msize, "9P2000.u");
+ break;
+ case p9_proto_legacy:
+ req = p9_client_rpc(c, P9_TVERSION, "ds",
+ c->msize, "9P2000");
+ break;
+ default:
+ return -EINVAL;
+ break;
+ }
+
if (IS_ERR(req))
return PTR_ERR(req);
- err = p9pdu_readf(req->rc, c->dotu, "ds", &msize, &version);
+ err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version);
if (err) {
P9_DPRINTK(P9_DEBUG_9P, "version error %d\n", err);
p9pdu_dump(1, req->rc);
@@ -642,10 +700,12 @@ int p9_client_version(struct p9_client *c)
}
P9_DPRINTK(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version);
- if (!memcmp(version, "9P2000.u", 8))
- c->dotu = 1;
- else if (!memcmp(version, "9P2000", 6))
- c->dotu = 0;
+ if (!strncmp(version, "9P2010.L", 8))
+ c->proto_version = p9_proto_2010L;
+ else if (!strncmp(version, "9P2000.u", 8))
+ c->proto_version = p9_proto_2000u;
+ else if (!strncmp(version, "9P2000", 6))
+ c->proto_version = p9_proto_legacy;
else {
err = -EREMOTEIO;
goto error;
@@ -700,8 +760,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
goto put_trans;
}
- P9_DPRINTK(P9_DEBUG_MUX, "clnt %p trans %p msize %d dotu %d\n",
- clnt, clnt->trans_mod, clnt->msize, clnt->dotu);
+ P9_DPRINTK(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n",
+ clnt, clnt->trans_mod, clnt->msize, clnt->proto_version);
err = clnt->trans_mod->create(clnt, dev_name, options);
if (err)
@@ -784,7 +844,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
goto error;
}
- err = p9pdu_readf(req->rc, clnt->dotu, "Q", &qid);
+ err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid);
if (err) {
p9pdu_dump(1, req->rc);
p9_free_req(clnt, req);
@@ -833,7 +893,7 @@ p9_client_auth(struct p9_client *clnt, char *uname, u32 n_uname, char *aname)
goto error;
}
- err = p9pdu_readf(req->rc, clnt->dotu, "Q", &qid);
+ err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid);
if (err) {
p9pdu_dump(1, req->rc);
p9_free_req(clnt, req);
@@ -891,7 +951,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
goto error;
}
- err = p9pdu_readf(req->rc, clnt->dotu, "R", &nwqids, &wqids);
+ err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids);
if (err) {
p9pdu_dump(1, req->rc);
p9_free_req(clnt, req);
@@ -952,7 +1012,7 @@ int p9_client_open(struct p9_fid *fid, int mode)
goto error;
}
- err = p9pdu_readf(req->rc, clnt->dotu, "Qd", &qid, &iounit);
+ err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit);
if (err) {
p9pdu_dump(1, req->rc);
goto free_and_error;
@@ -997,7 +1057,7 @@ int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode,
goto error;
}
- err = p9pdu_readf(req->rc, clnt->dotu, "Qd", &qid, &iounit);
+ err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit);
if (err) {
p9pdu_dump(1, req->rc);
goto free_and_error;
@@ -1098,7 +1158,7 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset,
goto error;
}
- err = p9pdu_readf(req->rc, clnt->dotu, "D", &count, &dataptr);
+ err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr);
if (err) {
p9pdu_dump(1, req->rc);
goto free_and_error;
@@ -1159,7 +1219,7 @@ p9_client_write(struct p9_fid *fid, char *data, const char __user *udata,
goto error;
}
- err = p9pdu_readf(req->rc, clnt->dotu, "d", &count);
+ err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count);
if (err) {
p9pdu_dump(1, req->rc);
goto free_and_error;
@@ -1199,7 +1259,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid)
goto error;
}
- err = p9pdu_readf(req->rc, clnt->dotu, "wS", &ignored, ret);
+ err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret);
if (err) {
p9pdu_dump(1, req->rc);
p9_free_req(clnt, req);
@@ -1226,7 +1286,7 @@ error:
}
EXPORT_SYMBOL(p9_client_stat);
-static int p9_client_statsize(struct p9_wstat *wst, int optional)
+static int p9_client_statsize(struct p9_wstat *wst, int proto_version)
{
int ret;
@@ -1245,7 +1305,7 @@ static int p9_client_statsize(struct p9_wstat *wst, int optional)
if (wst->muid)
ret += strlen(wst->muid);
- if (optional) {
+ if (proto_version == p9_proto_2000u) {
ret += 2+4+4+4; /* extension[s] n_uid[4] n_gid[4] n_muid[4] */
if (wst->extension)
ret += strlen(wst->extension);
@@ -1262,7 +1322,7 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst)
err = 0;
clnt = fid->clnt;
- wst->size = p9_client_statsize(wst, clnt->dotu);
+ wst->size = p9_client_statsize(wst, clnt->proto_version);
P9_DPRINTK(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid);
P9_DPRINTK(P9_DEBUG_9P,
" sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index fc70147c771..94f5a8f65e9 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -52,7 +52,7 @@
#endif
static int
-p9pdu_writef(struct p9_fcall *pdu, int optional, const char *fmt, ...);
+p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
#ifdef CONFIG_NET_9P_DEBUG
void
@@ -144,7 +144,8 @@ pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size)
*/
static int
-p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
+p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
+ va_list ap)
{
const char *ptr;
int errcode = 0;
@@ -194,7 +195,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
int16_t len;
int size;
- errcode = p9pdu_readf(pdu, optional, "w", &len);
+ errcode = p9pdu_readf(pdu, proto_version,
+ "w", &len);
if (errcode)
break;
@@ -217,7 +219,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
struct p9_qid *qid =
va_arg(ap, struct p9_qid *);
- errcode = p9pdu_readf(pdu, optional, "bdq",
+ errcode = p9pdu_readf(pdu, proto_version, "bdq",
&qid->type, &qid->version,
&qid->path);
}
@@ -230,7 +232,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
stbuf->n_uid = stbuf->n_gid = stbuf->n_muid =
-1;
errcode =
- p9pdu_readf(pdu, optional,
+ p9pdu_readf(pdu, proto_version,
"wwdQdddqssss?sddd",
&stbuf->size, &stbuf->type,
&stbuf->dev, &stbuf->qid,
@@ -250,7 +252,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
void **data = va_arg(ap, void **);
errcode =
- p9pdu_readf(pdu, optional, "d", count);
+ p9pdu_readf(pdu, proto_version, "d", count);
if (!errcode) {
*count =
MIN(*count,
@@ -263,8 +265,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
int16_t *nwname = va_arg(ap, int16_t *);
char ***wnames = va_arg(ap, char ***);
- errcode =
- p9pdu_readf(pdu, optional, "w", nwname);
+ errcode = p9pdu_readf(pdu, proto_version,
+ "w", nwname);
if (!errcode) {
*wnames =
kmalloc(sizeof(char *) * *nwname,
@@ -278,7 +280,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
for (i = 0; i < *nwname; i++) {
errcode =
- p9pdu_readf(pdu, optional,
+ p9pdu_readf(pdu,
+ proto_version,
"s",
&(*wnames)[i]);
if (errcode)
@@ -306,7 +309,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
*wqids = NULL;
errcode =
- p9pdu_readf(pdu, optional, "w", nwqid);
+ p9pdu_readf(pdu, proto_version, "w", nwqid);
if (!errcode) {
*wqids =
kmalloc(*nwqid *
@@ -321,7 +324,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
for (i = 0; i < *nwqid; i++) {
errcode =
- p9pdu_readf(pdu, optional,
+ p9pdu_readf(pdu,
+ proto_version,
"Q",
&(*wqids)[i]);
if (errcode)
@@ -336,7 +340,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
}
break;
case '?':
- if (!optional)
+ if (proto_version != p9_proto_2000u)
return 0;
break;
default:
@@ -352,7 +356,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
}
int
-p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
+p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
+ va_list ap)
{
const char *ptr;
int errcode = 0;
@@ -389,7 +394,8 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
if (sptr)
len = MIN(strlen(sptr), USHORT_MAX);
- errcode = p9pdu_writef(pdu, optional, "w", len);
+ errcode = p9pdu_writef(pdu, proto_version,
+ "w", len);
if (!errcode && pdu_write(pdu, sptr, len))
errcode = -EFAULT;
}
@@ -398,7 +404,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
const struct p9_qid *qid =
va_arg(ap, const struct p9_qid *);
errcode =
- p9pdu_writef(pdu, optional, "bdq",
+ p9pdu_writef(pdu, proto_version, "bdq",
qid->type, qid->version,
qid->path);
} break;
@@ -406,7 +412,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
const struct p9_wstat *stbuf =
va_arg(ap, const struct p9_wstat *);
errcode =
- p9pdu_writef(pdu, optional,
+ p9pdu_writef(pdu, proto_version,
"wwdQdddqssss?sddd",
stbuf->size, stbuf->type,
stbuf->dev, &stbuf->qid,
@@ -421,8 +427,8 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
int32_t count = va_arg(ap, int32_t);
const void *data = va_arg(ap, const void *);
- errcode =
- p9pdu_writef(pdu, optional, "d", count);
+ errcode = p9pdu_writef(pdu, proto_version, "d",
+ count);
if (!errcode && pdu_write(pdu, data, count))
errcode = -EFAULT;
}
@@ -431,8 +437,8 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
int32_t count = va_arg(ap, int32_t);
const char __user *udata =
va_arg(ap, const void __user *);
- errcode =
- p9pdu_writef(pdu, optional, "d", count);
+ errcode = p9pdu_writef(pdu, proto_version, "d",
+ count);
if (!errcode && pdu_write_u(pdu, udata, count))
errcode = -EFAULT;
}
@@ -441,14 +447,15 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
int16_t nwname = va_arg(ap, int);
const char **wnames = va_arg(ap, const char **);
- errcode =
- p9pdu_writef(pdu, optional, "w", nwname);
+ errcode = p9pdu_writef(pdu, proto_version, "w",
+ nwname);
if (!errcode) {
int i;
for (i = 0; i < nwname; i++) {
errcode =
- p9pdu_writef(pdu, optional,
+ p9pdu_writef(pdu,
+ proto_version,
"s",
wnames[i]);
if (errcode)
@@ -462,14 +469,15 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
struct p9_qid *wqids =
va_arg(ap, struct p9_qid *);
- errcode =
- p9pdu_writef(pdu, optional, "w", nwqid);
+ errcode = p9pdu_writef(pdu, proto_version, "w",
+ nwqid);
if (!errcode) {
int i;
for (i = 0; i < nwqid; i++) {
errcode =
- p9pdu_writef(pdu, optional,
+ p9pdu_writef(pdu,
+ proto_version,
"Q",
&wqids[i]);
if (errcode)
@@ -479,7 +487,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
}
break;
case '?':
- if (!optional)
+ if (proto_version != p9_proto_2000u)
return 0;
break;
default:
@@ -494,32 +502,32 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap)
return errcode;
}
-int p9pdu_readf(struct p9_fcall *pdu, int optional, const char *fmt, ...)
+int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...)
{
va_list ap;
int ret;
va_start(ap, fmt);
- ret = p9pdu_vreadf(pdu, optional, fmt, ap);
+ ret = p9pdu_vreadf(pdu, proto_version, fmt, ap);
va_end(ap);
return ret;
}
static int
-p9pdu_writef(struct p9_fcall *pdu, int optional, const char *fmt, ...)
+p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...)
{
va_list ap;
int ret;
va_start(ap, fmt);
- ret = p9pdu_vwritef(pdu, optional, fmt, ap);
+ ret = p9pdu_vwritef(pdu, proto_version, fmt, ap);
va_end(ap);
return ret;
}
-int p9stat_read(char *buf, int len, struct p9_wstat *st, int dotu)
+int p9stat_read(char *buf, int len, struct p9_wstat *st, int proto_version)
{
struct p9_fcall fake_pdu;
int ret;
@@ -529,7 +537,7 @@ int p9stat_read(char *buf, int len, struct p9_wstat *st, int dotu)
fake_pdu.sdata = buf;
fake_pdu.offset = 0;
- ret = p9pdu_readf(&fake_pdu, dotu, "S", st);
+ ret = p9pdu_readf(&fake_pdu, proto_version, "S", st);
if (ret) {
P9_DPRINTK(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret);
p9pdu_dump(1, &fake_pdu);
diff --git a/net/9p/protocol.h b/net/9p/protocol.h
index ccde462e7ac..2431c0f38d5 100644
--- a/net/9p/protocol.h
+++ b/net/9p/protocol.h
@@ -25,9 +25,9 @@
*
*/
-int
-p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap);
-int p9pdu_readf(struct p9_fcall *pdu, int optional, const char *fmt, ...);
+int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
+ va_list ap);
+int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type);
int p9pdu_finalize(struct p9_fcall *pdu);
void p9pdu_dump(int, struct p9_fcall *);
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index cb50f4ae5ee..0aaed481937 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -49,8 +49,6 @@
/* a single mutex to manage channel initialization and attachment */
static DEFINE_MUTEX(virtio_9p_lock);
-/* global which tracks highest initialized channel */
-static int chan_index;
/**
* struct virtio_chan - per-instance transport information
@@ -68,8 +66,7 @@ static int chan_index;
*
*/
-static struct virtio_chan {
- bool initialized;
+struct virtio_chan {
bool inuse;
spinlock_t lock;
@@ -80,7 +77,11 @@ static struct virtio_chan {
/* Scatterlist: can be too big for stack. */
struct scatterlist sg[VIRTQUEUE_NUM];
-} channels[MAX_9P_CHAN];
+
+ struct list_head chan_list;
+};
+
+static struct list_head virtio_chan_list;
/* How many bytes left in this page. */
static unsigned int rest_of_page(void *data)
@@ -217,9 +218,7 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
* p9_virtio_probe - probe for existence of 9P virtio channels
* @vdev: virtio device to probe
*
- * This probes for existing virtio channels. At present only
- * a single channel is in use, so in the future more work may need
- * to be done here.
+ * This probes for existing virtio channels.
*
*/
@@ -227,16 +226,10 @@ static int p9_virtio_probe(struct virtio_device *vdev)
{
int err;
struct virtio_chan *chan;
- int index;
- mutex_lock(&virtio_9p_lock);
- index = chan_index++;
- chan = &channels[index];
- mutex_unlock(&virtio_9p_lock);
-
- if (chan_index > MAX_9P_CHAN) {
- printk(KERN_ERR "9p: virtio: Maximum channels exceeded\n");
- BUG();
+ chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL);
+ if (!chan) {
+ printk(KERN_ERR "9p: Failed to allocate virtio 9P channel\n");
err = -ENOMEM;
goto fail;
}
@@ -255,15 +248,15 @@ static int p9_virtio_probe(struct virtio_device *vdev)
sg_init_table(chan->sg, VIRTQUEUE_NUM);
chan->inuse = false;
- chan->initialized = true;
+ mutex_lock(&virtio_9p_lock);
+ list_add_tail(&chan->chan_list, &virtio_chan_list);
+ mutex_unlock(&virtio_9p_lock);
return 0;
out_free_vq:
vdev->config->del_vqs(vdev);
+ kfree(chan);
fail:
- mutex_lock(&virtio_9p_lock);
- chan_index--;
- mutex_unlock(&virtio_9p_lock);
return err;
}
@@ -280,35 +273,31 @@ fail:
* We use a simple reference count mechanism to ensure that only a single
* mount has a channel open at a time.
*
- * Bugs: doesn't allow identification of a specific channel
- * to allocate, channels are allocated sequentially. This was
- * a pragmatic decision to get things rolling, but ideally some
- * way of identifying the channel to attach to would be nice
- * if we are going to support multiple channels.
- *
*/
static int
p9_virtio_create(struct p9_client *client, const char *devname, char *args)
{
- struct virtio_chan *chan = channels;
- int index = 0;
+ struct virtio_chan *chan;
+ int ret = -ENOENT;
+ int found = 0;
mutex_lock(&virtio_9p_lock);
- while (index < MAX_9P_CHAN) {
- if (chan->initialized && !chan->inuse) {
- chan->inuse = true;
- break;
- } else {
- index++;
- chan = &channels[index];
+ list_for_each_entry(chan, &virtio_chan_list, chan_list) {
+ if (!strcmp(devname, dev_name(&chan->vdev->dev))) {
+ if (!chan->inuse) {
+ chan->inuse = true;
+ found = 1;
+ break;
+ }
+ ret = -EBUSY;
}
}
mutex_unlock(&virtio_9p_lock);
- if (index >= MAX_9P_CHAN) {
+ if (!found) {
printk(KERN_ERR "9p: no channels available\n");
- return -ENODEV;
+ return ret;
}
client->trans = (void *)chan;
@@ -329,11 +318,13 @@ static void p9_virtio_remove(struct virtio_device *vdev)
struct virtio_chan *chan = vdev->priv;
BUG_ON(chan->inuse);
+ vdev->config->del_vqs(vdev);
+
+ mutex_lock(&virtio_9p_lock);
+ list_del(&chan->chan_list);
+ mutex_unlock(&virtio_9p_lock);
+ kfree(chan);
- if (chan->initialized) {
- vdev->config->del_vqs(vdev);
- chan->initialized = false;
- }
}
static struct virtio_device_id id_table[] = {
@@ -364,10 +355,7 @@ static struct p9_trans_module p9_virtio_trans = {
/* The standard init function */
static int __init p9_virtio_init(void)
{
- int count;
-
- for (count = 0; count < MAX_9P_CHAN; count++)
- channels[count].initialized = false;
+ INIT_LIST_HEAD(&virtio_chan_list);
v9fs_register_trans(&p9_virtio_trans);
return register_virtio_driver(&p9_virtio_drv);
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index daece36c0a5..7f1178f6b83 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -12,3 +12,6 @@ config HAVE_KVM_EVENTFD
config KVM_APIC_ARCHITECTURE
bool
+
+config KVM_MMIO
+ bool
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index f73de631e3e..057e2cca6af 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -504,12 +504,12 @@ out:
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
struct kvm_assigned_pci_dev *assigned_dev)
{
- int r = 0;
+ int r = 0, idx;
struct kvm_assigned_dev_kernel *match;
struct pci_dev *dev;
mutex_lock(&kvm->lock);
- down_read(&kvm->slots_lock);
+ idx = srcu_read_lock(&kvm->srcu);
match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
assigned_dev->assigned_dev_id);
@@ -526,7 +526,8 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
r = -ENOMEM;
goto out;
}
- dev = pci_get_bus_and_slot(assigned_dev->busnr,
+ dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
+ assigned_dev->busnr,
assigned_dev->devfn);
if (!dev) {
printk(KERN_INFO "%s: host device not found\n", __func__);
@@ -548,6 +549,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
pci_reset_function(dev);
match->assigned_dev_id = assigned_dev->assigned_dev_id;
+ match->host_segnr = assigned_dev->segnr;
match->host_busnr = assigned_dev->busnr;
match->host_devfn = assigned_dev->devfn;
match->flags = assigned_dev->flags;
@@ -573,7 +575,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
}
out:
- up_read(&kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
mutex_unlock(&kvm->lock);
return r;
out_list_del:
@@ -585,7 +587,7 @@ out_put:
pci_dev_put(dev);
out_free:
kfree(match);
- up_read(&kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
mutex_unlock(&kvm->lock);
return r;
}
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 04d69cd7049..5169736377a 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -92,41 +92,64 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
int kvm_coalesced_mmio_init(struct kvm *kvm)
{
struct kvm_coalesced_mmio_dev *dev;
+ struct page *page;
int ret;
+ ret = -ENOMEM;
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto out_err;
+ kvm->coalesced_mmio_ring = page_address(page);
+
+ ret = -ENOMEM;
dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
if (!dev)
- return -ENOMEM;
+ goto out_free_page;
spin_lock_init(&dev->lock);
kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
dev->kvm = kvm;
kvm->coalesced_mmio_dev = dev;
- ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+ mutex_unlock(&kvm->slots_lock);
if (ret < 0)
- kfree(dev);
+ goto out_free_dev;
+
+ return ret;
+out_free_dev:
+ kfree(dev);
+out_free_page:
+ __free_page(page);
+out_err:
return ret;
}
+void kvm_coalesced_mmio_free(struct kvm *kvm)
+{
+ if (kvm->coalesced_mmio_ring)
+ free_page((unsigned long)kvm->coalesced_mmio_ring);
+}
+
int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
- struct kvm_coalesced_mmio_zone *zone)
+ struct kvm_coalesced_mmio_zone *zone)
{
struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
if (dev == NULL)
return -EINVAL;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return -ENOBUFS;
}
dev->zone[dev->nb_zones] = *zone;
dev->nb_zones++;
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return 0;
}
@@ -140,10 +163,10 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
if (dev == NULL)
return -EINVAL;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
i = dev->nb_zones;
- while(i) {
+ while (i) {
z = &dev->zone[i - 1];
/* unregister all zones
@@ -158,7 +181,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
i--;
}
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return 0;
}
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
index 4b49f27fa31..8a5959e3535 100644
--- a/virt/kvm/coalesced_mmio.h
+++ b/virt/kvm/coalesced_mmio.h
@@ -1,3 +1,6 @@
+#ifndef __KVM_COALESCED_MMIO_H__
+#define __KVM_COALESCED_MMIO_H__
+
/*
* KVM coalesced MMIO
*
@@ -7,6 +10,8 @@
*
*/
+#ifdef CONFIG_KVM_MMIO
+
#define KVM_COALESCED_MMIO_ZONE_MAX 100
struct kvm_coalesced_mmio_dev {
@@ -18,7 +23,17 @@ struct kvm_coalesced_mmio_dev {
};
int kvm_coalesced_mmio_init(struct kvm *kvm);
+void kvm_coalesced_mmio_free(struct kvm *kvm);
int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_zone *zone);
int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_zone *zone);
+
+#else
+
+static inline int kvm_coalesced_mmio_init(struct kvm *kvm) { return 0; }
+static inline void kvm_coalesced_mmio_free(struct kvm *kvm) { }
+
+#endif
+
+#endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index a9d3fc6c681..7016319b1ec 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -47,7 +47,6 @@ struct _irqfd {
int gsi;
struct list_head list;
poll_table pt;
- wait_queue_head_t *wqh;
wait_queue_t wait;
struct work_struct inject;
struct work_struct shutdown;
@@ -159,8 +158,6 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
-
- irqfd->wqh = wqh;
add_wait_queue(wqh, &irqfd->wait);
}
@@ -463,7 +460,7 @@ static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
- struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
+ enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
struct _ioeventfd *p;
struct eventfd_ctx *eventfd;
int ret;
@@ -508,7 +505,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
else
p->wildcard = true;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
/* Verify that there isnt a match already */
if (ioeventfd_check_collision(kvm, p)) {
@@ -518,18 +515,18 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
kvm_iodevice_init(&p->dev, &ioeventfd_ops);
- ret = __kvm_io_bus_register_dev(bus, &p->dev);
+ ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
if (ret < 0)
goto unlock_fail;
list_add_tail(&p->list, &kvm->ioeventfds);
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return 0;
unlock_fail:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
fail:
kfree(p);
@@ -542,7 +539,7 @@ static int
kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
- struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
+ enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
struct _ioeventfd *p, *tmp;
struct eventfd_ctx *eventfd;
int ret = -ENOENT;
@@ -551,7 +548,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
@@ -565,13 +562,13 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
if (!p->wildcard && p->datamatch != args->datamatch)
continue;
- __kvm_io_bus_unregister_dev(bus, &p->dev);
+ kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
ioeventfd_release(p);
ret = 0;
break;
}
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
eventfd_ctx_put(eventfd);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 38a2d20b89d..3db15a807f8 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -100,6 +100,19 @@ static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
return injected;
}
+static void update_handled_vectors(struct kvm_ioapic *ioapic)
+{
+ DECLARE_BITMAP(handled_vectors, 256);
+ int i;
+
+ memset(handled_vectors, 0, sizeof(handled_vectors));
+ for (i = 0; i < IOAPIC_NUM_PINS; ++i)
+ __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
+ memcpy(ioapic->handled_vectors, handled_vectors,
+ sizeof(handled_vectors));
+ smp_wmb();
+}
+
static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
@@ -134,6 +147,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
e->bits |= (u32) val;
e->fields.remote_irr = 0;
}
+ update_handled_vectors(ioapic);
mask_after = e->fields.mask;
if (mask_before != mask_after)
kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after);
@@ -241,6 +255,9 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ smp_rmb();
+ if (!test_bit(vector, ioapic->handled_vectors))
+ return;
mutex_lock(&ioapic->lock);
__kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
mutex_unlock(&ioapic->lock);
@@ -352,6 +369,7 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
ioapic->ioregsel = 0;
ioapic->irr = 0;
ioapic->id = 0;
+ update_handled_vectors(ioapic);
}
static const struct kvm_io_device_ops ioapic_mmio_ops = {
@@ -372,13 +390,28 @@ int kvm_ioapic_init(struct kvm *kvm)
kvm_ioapic_reset(ioapic);
kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
ioapic->kvm = kvm;
- ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev);
- if (ret < 0)
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0) {
+ kvm->arch.vioapic = NULL;
kfree(ioapic);
+ }
return ret;
}
+void kvm_ioapic_destroy(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ if (ioapic) {
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+}
+
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
{
struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
@@ -399,6 +432,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
mutex_lock(&ioapic->lock);
memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ update_handled_vectors(ioapic);
mutex_unlock(&ioapic->lock);
return 0;
}
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 419c43b667a..8a751b78a43 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -46,6 +46,7 @@ struct kvm_ioapic {
struct kvm *kvm;
void (*ack_notifier)(void *opaque, int irq);
struct mutex lock;
+ DECLARE_BITMAP(handled_vectors, 256);
};
#ifdef DEBUG
@@ -71,6 +72,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_destroy(struct kvm *kvm);
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 15147583abd..80fd3ad3b2d 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -32,10 +32,10 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
-int kvm_iommu_map_pages(struct kvm *kvm,
- gfn_t base_gfn, unsigned long npages)
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
{
- gfn_t gfn = base_gfn;
+ gfn_t gfn = slot->base_gfn;
+ unsigned long npages = slot->npages;
pfn_t pfn;
int i, r = 0;
struct iommu_domain *domain = kvm->arch.iommu_domain;
@@ -54,7 +54,7 @@ int kvm_iommu_map_pages(struct kvm *kvm,
if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
continue;
- pfn = gfn_to_pfn(kvm, gfn);
+ pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
r = iommu_map_range(domain,
gfn_to_gpa(gfn),
pfn_to_hpa(pfn),
@@ -69,17 +69,19 @@ int kvm_iommu_map_pages(struct kvm *kvm,
return 0;
unmap_pages:
- kvm_iommu_put_pages(kvm, base_gfn, i);
+ kvm_iommu_put_pages(kvm, slot->base_gfn, i);
return r;
}
static int kvm_iommu_map_memslots(struct kvm *kvm)
{
int i, r = 0;
+ struct kvm_memslots *slots;
+
+ slots = rcu_dereference(kvm->memslots);
- for (i = 0; i < kvm->nmemslots; i++) {
- r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn,
- kvm->memslots[i].npages);
+ for (i = 0; i < slots->nmemslots; i++) {
+ r = kvm_iommu_map_pages(kvm, &slots->memslots[i]);
if (r)
break;
}
@@ -104,7 +106,8 @@ int kvm_assign_device(struct kvm *kvm,
r = iommu_attach_device(domain, &pdev->dev);
if (r) {
- printk(KERN_ERR "assign device %x:%x.%x failed",
+ printk(KERN_ERR "assign device %x:%x:%x.%x failed",
+ pci_domain_nr(pdev->bus),
pdev->bus->number,
PCI_SLOT(pdev->devfn),
PCI_FUNC(pdev->devfn));
@@ -125,7 +128,8 @@ int kvm_assign_device(struct kvm *kvm,
goto out_unmap;
}
- printk(KERN_DEBUG "assign device: host bdf = %x:%x:%x\n",
+ printk(KERN_DEBUG "assign device %x:%x:%x.%x\n",
+ assigned_dev->host_segnr,
assigned_dev->host_busnr,
PCI_SLOT(assigned_dev->host_devfn),
PCI_FUNC(assigned_dev->host_devfn));
@@ -152,7 +156,8 @@ int kvm_deassign_device(struct kvm *kvm,
iommu_detach_device(domain, &pdev->dev);
- printk(KERN_DEBUG "deassign device: host bdf = %x:%x:%x\n",
+ printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n",
+ assigned_dev->host_segnr,
assigned_dev->host_busnr,
PCI_SLOT(assigned_dev->host_devfn),
PCI_FUNC(assigned_dev->host_devfn));
@@ -210,10 +215,13 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
static int kvm_iommu_unmap_memslots(struct kvm *kvm)
{
int i;
+ struct kvm_memslots *slots;
+
+ slots = rcu_dereference(kvm->memslots);
- for (i = 0; i < kvm->nmemslots; i++) {
- kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn,
- kvm->memslots[i].npages);
+ for (i = 0; i < slots->nmemslots; i++) {
+ kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
+ slots->memslots[i].npages);
}
return 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a944be392d6..548f9253c19 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -44,6 +44,8 @@
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
+#include <linux/srcu.h>
+#include <linux/hugetlb.h>
#include <asm/processor.h>
#include <asm/io.h>
@@ -51,9 +53,7 @@
#include <asm/pgtable.h>
#include <asm-generic/bitops/le.h>
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
#include "coalesced_mmio.h"
-#endif
#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>
@@ -86,6 +86,8 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
static int hardware_enable_all(void);
static void hardware_disable_all(void);
+static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+
static bool kvm_rebooting;
static bool largepages_enabled = true;
@@ -136,7 +138,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
zalloc_cpumask_var(&cpus, GFP_ATOMIC);
- spin_lock(&kvm->requests_lock);
+ raw_spin_lock(&kvm->requests_lock);
me = smp_processor_id();
kvm_for_each_vcpu(i, vcpu, kvm) {
if (test_and_set_bit(req, &vcpu->requests))
@@ -151,7 +153,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
smp_call_function_many(cpus, ack_flush, NULL, 1);
else
called = false;
- spin_unlock(&kvm->requests_lock);
+ raw_spin_unlock(&kvm->requests_lock);
free_cpumask_var(cpus);
return called;
}
@@ -215,7 +217,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
unsigned long address)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int need_tlb_flush;
+ int need_tlb_flush, idx;
/*
* When ->invalidate_page runs, the linux pte has been zapped
@@ -235,10 +237,12 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
* pte after kvm_unmap_hva returned, without noticing the page
* is going to be freed.
*/
+ idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
need_tlb_flush = kvm_unmap_hva(kvm, address);
spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
@@ -252,11 +256,14 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
pte_t pte)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int idx;
+ idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
kvm_set_spte_hva(kvm, address, pte);
spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
}
static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
@@ -265,8 +272,9 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
unsigned long end)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int need_tlb_flush = 0;
+ int need_tlb_flush = 0, idx;
+ idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
/*
* The count increase must become visible at unlock time as no
@@ -277,6 +285,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
for (; start < end; start += PAGE_SIZE)
need_tlb_flush |= kvm_unmap_hva(kvm, start);
spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
@@ -314,11 +323,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
unsigned long address)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int young;
+ int young, idx;
+ idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
young = kvm_age_hva(kvm, address);
spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
if (young)
kvm_flush_remote_tlbs(kvm);
@@ -341,15 +352,26 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.change_pte = kvm_mmu_notifier_change_pte,
.release = kvm_mmu_notifier_release,
};
+
+static int kvm_init_mmu_notifier(struct kvm *kvm)
+{
+ kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+}
+
+#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
+
+static int kvm_init_mmu_notifier(struct kvm *kvm)
+{
+ return 0;
+}
+
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
static struct kvm *kvm_create_vm(void)
{
- int r = 0;
+ int r = 0, i;
struct kvm *kvm = kvm_arch_create_vm();
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
- struct page *page;
-#endif
if (IS_ERR(kvm))
goto out;
@@ -363,39 +385,35 @@ static struct kvm *kvm_create_vm(void)
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
+ r = -ENOMEM;
+ kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!kvm->memslots)
goto out_err;
- }
- kvm->coalesced_mmio_ring =
- (struct kvm_coalesced_mmio_ring *)page_address(page);
-#endif
-
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
- {
- kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
- r = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
- if (r) {
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
- put_page(page);
-#endif
+ if (init_srcu_struct(&kvm->srcu))
+ goto out_err;
+ for (i = 0; i < KVM_NR_BUSES; i++) {
+ kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
+ GFP_KERNEL);
+ if (!kvm->buses[i]) {
+ cleanup_srcu_struct(&kvm->srcu);
goto out_err;
}
}
-#endif
+
+ r = kvm_init_mmu_notifier(kvm);
+ if (r) {
+ cleanup_srcu_struct(&kvm->srcu);
+ goto out_err;
+ }
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
spin_lock_init(&kvm->mmu_lock);
- spin_lock_init(&kvm->requests_lock);
- kvm_io_bus_init(&kvm->pio_bus);
+ raw_spin_lock_init(&kvm->requests_lock);
kvm_eventfd_init(kvm);
mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock);
- kvm_io_bus_init(&kvm->mmio_bus);
- init_rwsem(&kvm->slots_lock);
+ mutex_init(&kvm->slots_lock);
atomic_set(&kvm->users_count, 1);
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
@@ -406,12 +424,12 @@ static struct kvm *kvm_create_vm(void)
out:
return kvm;
-#if defined(KVM_COALESCED_MMIO_PAGE_OFFSET) || \
- (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER))
out_err:
hardware_disable_all();
-#endif
out_err_nodisable:
+ for (i = 0; i < KVM_NR_BUSES; i++)
+ kfree(kvm->buses[i]);
+ kfree(kvm->memslots);
kfree(kvm);
return ERR_PTR(r);
}
@@ -446,13 +464,17 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
void kvm_free_physmem(struct kvm *kvm)
{
int i;
+ struct kvm_memslots *slots = kvm->memslots;
+
+ for (i = 0; i < slots->nmemslots; ++i)
+ kvm_free_physmem_slot(&slots->memslots[i], NULL);
- for (i = 0; i < kvm->nmemslots; ++i)
- kvm_free_physmem_slot(&kvm->memslots[i], NULL);
+ kfree(kvm->memslots);
}
static void kvm_destroy_vm(struct kvm *kvm)
{
+ int i;
struct mm_struct *mm = kvm->mm;
kvm_arch_sync_events(kvm);
@@ -460,12 +482,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
list_del(&kvm->vm_list);
spin_unlock(&kvm_lock);
kvm_free_irq_routing(kvm);
- kvm_io_bus_destroy(&kvm->pio_bus);
- kvm_io_bus_destroy(&kvm->mmio_bus);
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
- if (kvm->coalesced_mmio_ring != NULL)
- free_page((unsigned long)kvm->coalesced_mmio_ring);
-#endif
+ for (i = 0; i < KVM_NR_BUSES; i++)
+ kvm_io_bus_destroy(kvm->buses[i]);
+ kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
@@ -512,12 +531,13 @@ int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc)
{
- int r;
+ int r, flush_shadow = 0;
gfn_t base_gfn;
unsigned long npages;
unsigned long i;
struct kvm_memory_slot *memslot;
struct kvm_memory_slot old, new;
+ struct kvm_memslots *slots, *old_memslots;
r = -EINVAL;
/* General sanity checks */
@@ -532,7 +552,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
goto out;
- memslot = &kvm->memslots[mem->slot];
+ memslot = &kvm->memslots->memslots[mem->slot];
base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
npages = mem->memory_size >> PAGE_SHIFT;
@@ -553,7 +573,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* Check for overlaps */
r = -EEXIST;
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *s = &kvm->memslots[i];
+ struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
if (s == memslot || !s->npages)
continue;
@@ -579,15 +599,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
memset(new.rmap, 0, npages * sizeof(*new.rmap));
new.user_alloc = user_alloc;
- /*
- * hva_to_rmmap() serialzies with the mmu_lock and to be
- * safe it has to ignore memslots with !user_alloc &&
- * !userspace_addr.
- */
- if (user_alloc)
- new.userspace_addr = mem->userspace_addr;
- else
- new.userspace_addr = 0;
+ new.userspace_addr = mem->userspace_addr;
}
if (!npages)
goto skip_lpage;
@@ -642,8 +654,9 @@ skip_lpage:
if (!new.dirty_bitmap)
goto out_free;
memset(new.dirty_bitmap, 0, dirty_bytes);
+ /* destroy any largepage mappings for dirty tracking */
if (old.npages)
- kvm_arch_flush_shadow(kvm);
+ flush_shadow = 1;
}
#else /* not defined CONFIG_S390 */
new.user_alloc = user_alloc;
@@ -651,36 +664,72 @@ skip_lpage:
new.userspace_addr = mem->userspace_addr;
#endif /* not defined CONFIG_S390 */
- if (!npages)
+ if (!npages) {
+ r = -ENOMEM;
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots)
+ goto out_free;
+ memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+ if (mem->slot >= slots->nmemslots)
+ slots->nmemslots = mem->slot + 1;
+ slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
+
+ old_memslots = kvm->memslots;
+ rcu_assign_pointer(kvm->memslots, slots);
+ synchronize_srcu_expedited(&kvm->srcu);
+ /* From this point no new shadow pages pointing to a deleted
+ * memslot will be created.
+ *
+ * validation of sp->gfn happens in:
+ * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
+ * - kvm_is_visible_gfn (mmu_check_roots)
+ */
kvm_arch_flush_shadow(kvm);
+ kfree(old_memslots);
+ }
- spin_lock(&kvm->mmu_lock);
- if (mem->slot >= kvm->nmemslots)
- kvm->nmemslots = mem->slot + 1;
-
- *memslot = new;
- spin_unlock(&kvm->mmu_lock);
-
- r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
- if (r) {
- spin_lock(&kvm->mmu_lock);
- *memslot = old;
- spin_unlock(&kvm->mmu_lock);
+ r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
+ if (r)
goto out_free;
- }
- kvm_free_physmem_slot(&old, npages ? &new : NULL);
- /* Slot deletion case: we have to update the current slot */
- spin_lock(&kvm->mmu_lock);
- if (!npages)
- *memslot = old;
- spin_unlock(&kvm->mmu_lock);
#ifdef CONFIG_DMAR
/* map the pages in iommu page table */
- r = kvm_iommu_map_pages(kvm, base_gfn, npages);
- if (r)
- goto out;
+ if (npages) {
+ r = kvm_iommu_map_pages(kvm, &new);
+ if (r)
+ goto out_free;
+ }
#endif
+
+ r = -ENOMEM;
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots)
+ goto out_free;
+ memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+ if (mem->slot >= slots->nmemslots)
+ slots->nmemslots = mem->slot + 1;
+
+ /* actual memory is freed via old in kvm_free_physmem_slot below */
+ if (!npages) {
+ new.rmap = NULL;
+ new.dirty_bitmap = NULL;
+ for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
+ new.lpage_info[i] = NULL;
+ }
+
+ slots->memslots[mem->slot] = new;
+ old_memslots = kvm->memslots;
+ rcu_assign_pointer(kvm->memslots, slots);
+ synchronize_srcu_expedited(&kvm->srcu);
+
+ kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
+
+ kvm_free_physmem_slot(&old, &new);
+ kfree(old_memslots);
+
+ if (flush_shadow)
+ kvm_arch_flush_shadow(kvm);
+
return 0;
out_free:
@@ -697,9 +746,9 @@ int kvm_set_memory_region(struct kvm *kvm,
{
int r;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
r = __kvm_set_memory_region(kvm, mem, user_alloc);
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
@@ -726,7 +775,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
if (log->slot >= KVM_MEMORY_SLOTS)
goto out;
- memslot = &kvm->memslots[log->slot];
+ memslot = &kvm->memslots->memslots[log->slot];
r = -ENOENT;
if (!memslot->dirty_bitmap)
goto out;
@@ -780,9 +829,10 @@ EXPORT_SYMBOL_GPL(kvm_is_error_hva);
struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
{
int i;
+ struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
- for (i = 0; i < kvm->nmemslots; ++i) {
- struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ for (i = 0; i < slots->nmemslots; ++i) {
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
if (gfn >= memslot->base_gfn
&& gfn < memslot->base_gfn + memslot->npages)
@@ -801,10 +851,14 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
int i;
+ struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
- gfn = unalias_gfn(kvm, gfn);
+ gfn = unalias_gfn_instantiation(kvm, gfn);
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
+
+ if (memslot->flags & KVM_MEMSLOT_INVALID)
+ continue;
if (gfn >= memslot->base_gfn
&& gfn < memslot->base_gfn + memslot->npages)
@@ -814,33 +868,68 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr, size;
+
+ size = PAGE_SIZE;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return PAGE_SIZE;
+
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, addr);
+ if (!vma)
+ goto out;
+
+ size = vma_kernel_pagesize(vma);
+
+out:
+ up_read(&current->mm->mmap_sem);
+
+ return size;
+}
+
+int memslot_id(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+ struct kvm_memory_slot *memslot = NULL;
+
+ gfn = unalias_gfn(kvm, gfn);
+ for (i = 0; i < slots->nmemslots; ++i) {
+ memslot = &slots->memslots[i];
+
+ if (gfn >= memslot->base_gfn
+ && gfn < memslot->base_gfn + memslot->npages)
+ break;
+ }
+
+ return memslot - slots->memslots;
+}
+
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
- gfn = unalias_gfn(kvm, gfn);
+ gfn = unalias_gfn_instantiation(kvm, gfn);
slot = gfn_to_memslot_unaliased(kvm, gfn);
- if (!slot)
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return bad_hva();
return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
{
struct page *page[1];
- unsigned long addr;
int npages;
pfn_t pfn;
might_sleep();
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr)) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
- }
-
npages = get_user_pages_fast(addr, 1, 1, page);
if (unlikely(npages != 1)) {
@@ -865,8 +954,32 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
return pfn;
}
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr)) {
+ get_page(bad_page);
+ return page_to_pfn(bad_page);
+ }
+
+ return hva_to_pfn(kvm, addr);
+}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
+static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
+}
+
+pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ unsigned long addr = gfn_to_hva_memslot(slot, gfn);
+ return hva_to_pfn(kvm, addr);
+}
+
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
@@ -1854,12 +1967,7 @@ static struct notifier_block kvm_reboot_notifier = {
.priority = 0,
};
-void kvm_io_bus_init(struct kvm_io_bus *bus)
-{
- memset(bus, 0, sizeof(*bus));
-}
-
-void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
int i;
@@ -1868,13 +1976,15 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus)
kvm_iodevice_destructor(pos);
}
+ kfree(bus);
}
/* kvm_io_bus_write - called under kvm->slots_lock */
-int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr,
+int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, const void *val)
{
int i;
+ struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
for (i = 0; i < bus->dev_count; i++)
if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
return 0;
@@ -1882,59 +1992,71 @@ int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr,
}
/* kvm_io_bus_read - called under kvm->slots_lock */
-int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val)
+int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+ int len, void *val)
{
int i;
+ struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
+
for (i = 0; i < bus->dev_count; i++)
if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
return 0;
return -EOPNOTSUPP;
}
-int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
- struct kvm_io_device *dev)
+/* Caller must hold slots_lock. */
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev)
{
- int ret;
-
- down_write(&kvm->slots_lock);
- ret = __kvm_io_bus_register_dev(bus, dev);
- up_write(&kvm->slots_lock);
+ struct kvm_io_bus *new_bus, *bus;
- return ret;
-}
-
-/* An unlocked version. Caller must have write lock on slots_lock. */
-int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
- struct kvm_io_device *dev)
-{
+ bus = kvm->buses[bus_idx];
if (bus->dev_count > NR_IOBUS_DEVS-1)
return -ENOSPC;
- bus->devs[bus->dev_count++] = dev;
+ new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
+ if (!new_bus)
+ return -ENOMEM;
+ memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+ new_bus->devs[new_bus->dev_count++] = dev;
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+ synchronize_srcu_expedited(&kvm->srcu);
+ kfree(bus);
return 0;
}
-void kvm_io_bus_unregister_dev(struct kvm *kvm,
- struct kvm_io_bus *bus,
- struct kvm_io_device *dev)
+/* Caller must hold slots_lock. */
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev)
{
- down_write(&kvm->slots_lock);
- __kvm_io_bus_unregister_dev(bus, dev);
- up_write(&kvm->slots_lock);
-}
+ int i, r;
+ struct kvm_io_bus *new_bus, *bus;
-/* An unlocked version. Caller must have write lock on slots_lock. */
-void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
- struct kvm_io_device *dev)
-{
- int i;
+ new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
+ if (!new_bus)
+ return -ENOMEM;
- for (i = 0; i < bus->dev_count; i++)
- if (bus->devs[i] == dev) {
- bus->devs[i] = bus->devs[--bus->dev_count];
+ bus = kvm->buses[bus_idx];
+ memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+
+ r = -ENOENT;
+ for (i = 0; i < new_bus->dev_count; i++)
+ if (new_bus->devs[i] == dev) {
+ r = 0;
+ new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
break;
}
+
+ if (r) {
+ kfree(new_bus);
+ return r;
+ }
+
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+ synchronize_srcu_expedited(&kvm->srcu);
+ kfree(bus);
+ return r;
}
static struct notifier_block kvm_cpu_notifier = {