diff options
Diffstat (limited to 'virt/kvm')
-rw-r--r-- | virt/kvm/Kconfig | 3 | ||||
-rw-r--r-- | virt/kvm/coalesced_mmio.c | 8 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 297 |
3 files changed, 164 insertions, 144 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 9fb1ff6f19e5..b74916de5183 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -92,3 +92,6 @@ config KVM_XFER_TO_GUEST_WORK config HAVE_KVM_PM_NOTIFIER bool + +config KVM_GENERIC_HARDWARE_ENABLING + bool diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 0be80c213f7f..5ef88f5a0864 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -187,15 +187,17 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, r = kvm_io_bus_unregister_dev(kvm, zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev); + kvm_iodevice_destructor(&dev->dev); + /* * On failure, unregister destroys all devices on the * bus _except_ the target device, i.e. coalesced_zones - * has been modified. No need to restart the walk as - * there aren't any zones left. + * has been modified. Bail after destroying the target + * device, there's no need to restart the walk as there + * aren't any zones left. */ if (r) break; - kvm_iodevice_destructor(&dev->dev); } } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9c60384b5ae0..d255964ec331 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -100,13 +100,8 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); */ DEFINE_MUTEX(kvm_lock); -static DEFINE_RAW_SPINLOCK(kvm_count_lock); LIST_HEAD(vm_list); -static cpumask_var_t cpus_hardware_enabled; -static int kvm_usage_count; -static atomic_t hardware_enable_failed; - static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; @@ -148,9 +143,6 @@ static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -__visible bool kvm_rebooting; -EXPORT_SYMBOL_GPL(kvm_rebooting); - #define KVM_EVENT_CREATE_VM 0 #define KVM_EVENT_DESTROY_VM 1 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); @@ -5102,50 +5094,70 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; -static void hardware_enable_nolock(void *junk) -{ - int cpu = raw_smp_processor_id(); - int r; - - if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) - return; +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +__visible bool kvm_rebooting; +EXPORT_SYMBOL_GPL(kvm_rebooting); - cpumask_set_cpu(cpu, cpus_hardware_enabled); +static DEFINE_PER_CPU(bool, hardware_enabled); +static int kvm_usage_count; - r = kvm_arch_hardware_enable(); +static int __hardware_enable_nolock(void) +{ + if (__this_cpu_read(hardware_enabled)) + return 0; - if (r) { - cpumask_clear_cpu(cpu, cpus_hardware_enabled); - atomic_inc(&hardware_enable_failed); - pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); + if (kvm_arch_hardware_enable()) { + pr_info("kvm: enabling virtualization on CPU%d failed\n", + raw_smp_processor_id()); + return -EIO; } + + __this_cpu_write(hardware_enabled, true); + return 0; } -static int kvm_starting_cpu(unsigned int cpu) +static void hardware_enable_nolock(void *failed) { - raw_spin_lock(&kvm_count_lock); + if (__hardware_enable_nolock()) + atomic_inc(failed); +} + +static int kvm_online_cpu(unsigned int cpu) +{ + int ret = 0; + + /* + * Abort the CPU online process if hardware virtualization cannot + * be enabled. Otherwise running VMs would encounter unrecoverable + * errors when scheduled to this CPU. + */ + mutex_lock(&kvm_lock); if (kvm_usage_count) - hardware_enable_nolock(NULL); - raw_spin_unlock(&kvm_count_lock); - return 0; + ret = __hardware_enable_nolock(); + mutex_unlock(&kvm_lock); + return ret; } static void hardware_disable_nolock(void *junk) { - int cpu = raw_smp_processor_id(); - - if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) + /* + * Note, hardware_disable_all_nolock() tells all online CPUs to disable + * hardware, not just CPUs that successfully enabled hardware! + */ + if (!__this_cpu_read(hardware_enabled)) return; - cpumask_clear_cpu(cpu, cpus_hardware_enabled); + kvm_arch_hardware_disable(); + + __this_cpu_write(hardware_enabled, false); } -static int kvm_dying_cpu(unsigned int cpu) +static int kvm_offline_cpu(unsigned int cpu) { - raw_spin_lock(&kvm_count_lock); + mutex_lock(&kvm_lock); if (kvm_usage_count) hardware_disable_nolock(NULL); - raw_spin_unlock(&kvm_count_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -5160,29 +5172,41 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { - raw_spin_lock(&kvm_count_lock); + cpus_read_lock(); + mutex_lock(&kvm_lock); hardware_disable_all_nolock(); - raw_spin_unlock(&kvm_count_lock); + mutex_unlock(&kvm_lock); + cpus_read_unlock(); } static int hardware_enable_all(void) { + atomic_t failed = ATOMIC_INIT(0); int r = 0; - raw_spin_lock(&kvm_count_lock); + /* + * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu() + * is called, and so on_each_cpu() between them includes the CPU that + * is being onlined. As a result, hardware_enable_nolock() may get + * invoked before kvm_online_cpu(), which also enables hardware if the + * usage count is non-zero. Disable CPU hotplug to avoid attempting to + * enable hardware multiple times. + */ + cpus_read_lock(); + mutex_lock(&kvm_lock); kvm_usage_count++; if (kvm_usage_count == 1) { - atomic_set(&hardware_enable_failed, 0); - on_each_cpu(hardware_enable_nolock, NULL, 1); + on_each_cpu(hardware_enable_nolock, &failed, 1); - if (atomic_read(&hardware_enable_failed)) { + if (atomic_read(&failed)) { hardware_disable_all_nolock(); r = -EBUSY; } } - raw_spin_unlock(&kvm_count_lock); + mutex_unlock(&kvm_lock); + cpus_read_unlock(); return r; } @@ -5207,6 +5231,49 @@ static struct notifier_block kvm_reboot_notifier = { .priority = 0, }; +static int kvm_suspend(void) +{ + /* + * Secondary CPUs and CPU hotplug are disabled across the suspend/resume + * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count + * is stable. Assert that kvm_lock is not held to ensure the system + * isn't suspended while KVM is enabling hardware. Hardware enabling + * can be preempted, but the task cannot be frozen until it has dropped + * all locks (userspace tasks are frozen via a fake signal). + */ + lockdep_assert_not_held(&kvm_lock); + lockdep_assert_irqs_disabled(); + + if (kvm_usage_count) + hardware_disable_nolock(NULL); + return 0; +} + +static void kvm_resume(void) +{ + lockdep_assert_not_held(&kvm_lock); + lockdep_assert_irqs_disabled(); + + if (kvm_usage_count) + WARN_ON_ONCE(__hardware_enable_nolock()); +} + +static struct syscore_ops kvm_syscore_ops = { + .suspend = kvm_suspend, + .resume = kvm_resume, +}; +#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ +static int hardware_enable_all(void) +{ + return 0; +} + +static void hardware_disable_all(void) +{ + +} +#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ + static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { int i; @@ -5785,26 +5852,6 @@ static void kvm_init_debug(void) } } -static int kvm_suspend(void) -{ - if (kvm_usage_count) - hardware_disable_nolock(NULL); - return 0; -} - -static void kvm_resume(void) -{ - if (kvm_usage_count) { - lockdep_assert_not_held(&kvm_count_lock); - hardware_enable_nolock(NULL); - } -} - -static struct syscore_ops kvm_syscore_ops = { - .suspend = kvm_suspend, - .resume = kvm_resume, -}; - static inline struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) { @@ -5909,62 +5956,20 @@ void kvm_unregister_perf_callbacks(void) } #endif -struct kvm_cpu_compat_check { - void *opaque; - int *ret; -}; - -static void check_processor_compat(void *data) -{ - struct kvm_cpu_compat_check *c = data; - - *c->ret = kvm_arch_check_processor_compat(c->opaque); -} - -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module) +int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) { - struct kvm_cpu_compat_check c; int r; int cpu; - r = kvm_arch_init(opaque); - if (r) - goto out_fail; - - /* - * kvm_arch_init makes sure there's at most one caller - * for architectures that support multiple implementations, - * like intel and amd on x86. - * kvm_arch_init must be called before kvm_irqfd_init to avoid creating - * conflicts in case kvm is already setup for another implementation. - */ - r = kvm_irqfd_init(); +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING + r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", + kvm_online_cpu, kvm_offline_cpu); if (r) - goto out_irqfd; - - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { - r = -ENOMEM; - goto out_free_0; - } - - r = kvm_arch_hardware_setup(opaque); - if (r < 0) - goto out_free_1; - - c.ret = &r; - c.opaque = opaque; - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, check_processor_compat, &c, 1); - if (r < 0) - goto out_free_2; - } + return r; - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", - kvm_starting_cpu, kvm_dying_cpu); - if (r) - goto out_free_2; register_reboot_notifier(&kvm_reboot_notifier); + register_syscore_ops(&kvm_syscore_ops); +#endif /* A kmem cache lets us meet the alignment requirements of fx_save. */ if (!vcpu_align) @@ -5978,59 +5983,65 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, NULL); if (!kvm_vcpu_cache) { r = -ENOMEM; - goto out_free_3; + goto err_vcpu_cache; } for_each_possible_cpu(cpu) { if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), GFP_KERNEL, cpu_to_node(cpu))) { r = -ENOMEM; - goto out_free_4; + goto err_cpu_kick_mask; } } + r = kvm_irqfd_init(); + if (r) + goto err_irqfd; + r = kvm_async_pf_init(); if (r) - goto out_free_4; + goto err_async_pf; kvm_chardev_ops.owner = module; - r = misc_register(&kvm_dev); - if (r) { - pr_err("kvm: misc device register failed\n"); - goto out_unreg; - } - - register_syscore_ops(&kvm_syscore_ops); - kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; kvm_init_debug(); r = kvm_vfio_ops_init(); - WARN_ON(r); + if (WARN_ON_ONCE(r)) + goto err_vfio; + + /* + * Registration _must_ be the very last thing done, as this exposes + * /dev/kvm to userspace, i.e. all infrastructure must be setup! + */ + r = misc_register(&kvm_dev); + if (r) { + pr_err("kvm: misc device register failed\n"); + goto err_register; + } return 0; -out_unreg: +err_register: + kvm_vfio_ops_exit(); +err_vfio: kvm_async_pf_deinit(); -out_free_4: +err_async_pf: + kvm_irqfd_exit(); +err_irqfd: +err_cpu_kick_mask: for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); -out_free_3: +err_vcpu_cache: +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING + unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); -out_free_2: - kvm_arch_hardware_unsetup(); -out_free_1: - free_cpumask_var(cpus_hardware_enabled); -out_free_0: - kvm_irqfd_exit(); -out_irqfd: - kvm_arch_exit(); -out_fail: + cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); +#endif return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6039,21 +6050,25 @@ void kvm_exit(void) { int cpu; - debugfs_remove_recursive(kvm_debugfs_dir); + /* + * Note, unregistering /dev/kvm doesn't strictly need to come first, + * fops_get(), a.k.a. try_module_get(), prevents acquiring references + * to KVM while the module is being stopped. + */ misc_deregister(&kvm_dev); + + debugfs_remove_recursive(kvm_debugfs_dir); for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); + kvm_vfio_ops_exit(); kvm_async_pf_deinit(); +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); - on_each_cpu(hardware_disable_nolock, NULL, 1); - kvm_arch_hardware_unsetup(); - kvm_arch_exit(); + cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); +#endif kvm_irqfd_exit(); - free_cpumask_var(cpus_hardware_enabled); - kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); |