summaryrefslogtreecommitdiff
path: root/virt/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'virt/kvm')
-rw-r--r--virt/kvm/Kconfig3
-rw-r--r--virt/kvm/coalesced_mmio.c8
-rw-r--r--virt/kvm/kvm_main.c297
3 files changed, 164 insertions, 144 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 9fb1ff6f19e5..b74916de5183 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -92,3 +92,6 @@ config KVM_XFER_TO_GUEST_WORK
config HAVE_KVM_PM_NOTIFIER
bool
+
+config KVM_GENERIC_HARDWARE_ENABLING
+ bool
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 0be80c213f7f..5ef88f5a0864 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -187,15 +187,17 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
r = kvm_io_bus_unregister_dev(kvm,
zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev);
+ kvm_iodevice_destructor(&dev->dev);
+
/*
* On failure, unregister destroys all devices on the
* bus _except_ the target device, i.e. coalesced_zones
- * has been modified. No need to restart the walk as
- * there aren't any zones left.
+ * has been modified. Bail after destroying the target
+ * device, there's no need to restart the walk as there
+ * aren't any zones left.
*/
if (r)
break;
- kvm_iodevice_destructor(&dev->dev);
}
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9c60384b5ae0..d255964ec331 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -100,13 +100,8 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
*/
DEFINE_MUTEX(kvm_lock);
-static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);
-static cpumask_var_t cpus_hardware_enabled;
-static int kvm_usage_count;
-static atomic_t hardware_enable_failed;
-
static struct kmem_cache *kvm_vcpu_cache;
static __read_mostly struct preempt_ops kvm_preempt_ops;
@@ -148,9 +143,6 @@ static void hardware_disable_all(void);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-__visible bool kvm_rebooting;
-EXPORT_SYMBOL_GPL(kvm_rebooting);
-
#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
@@ -5102,50 +5094,70 @@ static struct miscdevice kvm_dev = {
&kvm_chardev_ops,
};
-static void hardware_enable_nolock(void *junk)
-{
- int cpu = raw_smp_processor_id();
- int r;
-
- if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
- return;
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+__visible bool kvm_rebooting;
+EXPORT_SYMBOL_GPL(kvm_rebooting);
- cpumask_set_cpu(cpu, cpus_hardware_enabled);
+static DEFINE_PER_CPU(bool, hardware_enabled);
+static int kvm_usage_count;
- r = kvm_arch_hardware_enable();
+static int __hardware_enable_nolock(void)
+{
+ if (__this_cpu_read(hardware_enabled))
+ return 0;
- if (r) {
- cpumask_clear_cpu(cpu, cpus_hardware_enabled);
- atomic_inc(&hardware_enable_failed);
- pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
+ if (kvm_arch_hardware_enable()) {
+ pr_info("kvm: enabling virtualization on CPU%d failed\n",
+ raw_smp_processor_id());
+ return -EIO;
}
+
+ __this_cpu_write(hardware_enabled, true);
+ return 0;
}
-static int kvm_starting_cpu(unsigned int cpu)
+static void hardware_enable_nolock(void *failed)
{
- raw_spin_lock(&kvm_count_lock);
+ if (__hardware_enable_nolock())
+ atomic_inc(failed);
+}
+
+static int kvm_online_cpu(unsigned int cpu)
+{
+ int ret = 0;
+
+ /*
+ * Abort the CPU online process if hardware virtualization cannot
+ * be enabled. Otherwise running VMs would encounter unrecoverable
+ * errors when scheduled to this CPU.
+ */
+ mutex_lock(&kvm_lock);
if (kvm_usage_count)
- hardware_enable_nolock(NULL);
- raw_spin_unlock(&kvm_count_lock);
- return 0;
+ ret = __hardware_enable_nolock();
+ mutex_unlock(&kvm_lock);
+ return ret;
}
static void hardware_disable_nolock(void *junk)
{
- int cpu = raw_smp_processor_id();
-
- if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
+ /*
+ * Note, hardware_disable_all_nolock() tells all online CPUs to disable
+ * hardware, not just CPUs that successfully enabled hardware!
+ */
+ if (!__this_cpu_read(hardware_enabled))
return;
- cpumask_clear_cpu(cpu, cpus_hardware_enabled);
+
kvm_arch_hardware_disable();
+
+ __this_cpu_write(hardware_enabled, false);
}
-static int kvm_dying_cpu(unsigned int cpu)
+static int kvm_offline_cpu(unsigned int cpu)
{
- raw_spin_lock(&kvm_count_lock);
+ mutex_lock(&kvm_lock);
if (kvm_usage_count)
hardware_disable_nolock(NULL);
- raw_spin_unlock(&kvm_count_lock);
+ mutex_unlock(&kvm_lock);
return 0;
}
@@ -5160,29 +5172,41 @@ static void hardware_disable_all_nolock(void)
static void hardware_disable_all(void)
{
- raw_spin_lock(&kvm_count_lock);
+ cpus_read_lock();
+ mutex_lock(&kvm_lock);
hardware_disable_all_nolock();
- raw_spin_unlock(&kvm_count_lock);
+ mutex_unlock(&kvm_lock);
+ cpus_read_unlock();
}
static int hardware_enable_all(void)
{
+ atomic_t failed = ATOMIC_INIT(0);
int r = 0;
- raw_spin_lock(&kvm_count_lock);
+ /*
+ * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
+ * is called, and so on_each_cpu() between them includes the CPU that
+ * is being onlined. As a result, hardware_enable_nolock() may get
+ * invoked before kvm_online_cpu(), which also enables hardware if the
+ * usage count is non-zero. Disable CPU hotplug to avoid attempting to
+ * enable hardware multiple times.
+ */
+ cpus_read_lock();
+ mutex_lock(&kvm_lock);
kvm_usage_count++;
if (kvm_usage_count == 1) {
- atomic_set(&hardware_enable_failed, 0);
- on_each_cpu(hardware_enable_nolock, NULL, 1);
+ on_each_cpu(hardware_enable_nolock, &failed, 1);
- if (atomic_read(&hardware_enable_failed)) {
+ if (atomic_read(&failed)) {
hardware_disable_all_nolock();
r = -EBUSY;
}
}
- raw_spin_unlock(&kvm_count_lock);
+ mutex_unlock(&kvm_lock);
+ cpus_read_unlock();
return r;
}
@@ -5207,6 +5231,49 @@ static struct notifier_block kvm_reboot_notifier = {
.priority = 0,
};
+static int kvm_suspend(void)
+{
+ /*
+ * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
+ * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
+ * is stable. Assert that kvm_lock is not held to ensure the system
+ * isn't suspended while KVM is enabling hardware. Hardware enabling
+ * can be preempted, but the task cannot be frozen until it has dropped
+ * all locks (userspace tasks are frozen via a fake signal).
+ */
+ lockdep_assert_not_held(&kvm_lock);
+ lockdep_assert_irqs_disabled();
+
+ if (kvm_usage_count)
+ hardware_disable_nolock(NULL);
+ return 0;
+}
+
+static void kvm_resume(void)
+{
+ lockdep_assert_not_held(&kvm_lock);
+ lockdep_assert_irqs_disabled();
+
+ if (kvm_usage_count)
+ WARN_ON_ONCE(__hardware_enable_nolock());
+}
+
+static struct syscore_ops kvm_syscore_ops = {
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+};
+#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
+static int hardware_enable_all(void)
+{
+ return 0;
+}
+
+static void hardware_disable_all(void)
+{
+
+}
+#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
+
static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
int i;
@@ -5785,26 +5852,6 @@ static void kvm_init_debug(void)
}
}
-static int kvm_suspend(void)
-{
- if (kvm_usage_count)
- hardware_disable_nolock(NULL);
- return 0;
-}
-
-static void kvm_resume(void)
-{
- if (kvm_usage_count) {
- lockdep_assert_not_held(&kvm_count_lock);
- hardware_enable_nolock(NULL);
- }
-}
-
-static struct syscore_ops kvm_syscore_ops = {
- .suspend = kvm_suspend,
- .resume = kvm_resume,
-};
-
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
@@ -5909,62 +5956,20 @@ void kvm_unregister_perf_callbacks(void)
}
#endif
-struct kvm_cpu_compat_check {
- void *opaque;
- int *ret;
-};
-
-static void check_processor_compat(void *data)
-{
- struct kvm_cpu_compat_check *c = data;
-
- *c->ret = kvm_arch_check_processor_compat(c->opaque);
-}
-
-int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
- struct module *module)
+int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
{
- struct kvm_cpu_compat_check c;
int r;
int cpu;
- r = kvm_arch_init(opaque);
- if (r)
- goto out_fail;
-
- /*
- * kvm_arch_init makes sure there's at most one caller
- * for architectures that support multiple implementations,
- * like intel and amd on x86.
- * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
- * conflicts in case kvm is already setup for another implementation.
- */
- r = kvm_irqfd_init();
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+ r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
+ kvm_online_cpu, kvm_offline_cpu);
if (r)
- goto out_irqfd;
-
- if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- r = kvm_arch_hardware_setup(opaque);
- if (r < 0)
- goto out_free_1;
-
- c.ret = &r;
- c.opaque = opaque;
- for_each_online_cpu(cpu) {
- smp_call_function_single(cpu, check_processor_compat, &c, 1);
- if (r < 0)
- goto out_free_2;
- }
+ return r;
- r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
- kvm_starting_cpu, kvm_dying_cpu);
- if (r)
- goto out_free_2;
register_reboot_notifier(&kvm_reboot_notifier);
+ register_syscore_ops(&kvm_syscore_ops);
+#endif
/* A kmem cache lets us meet the alignment requirements of fx_save. */
if (!vcpu_align)
@@ -5978,59 +5983,65 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
NULL);
if (!kvm_vcpu_cache) {
r = -ENOMEM;
- goto out_free_3;
+ goto err_vcpu_cache;
}
for_each_possible_cpu(cpu) {
if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
GFP_KERNEL, cpu_to_node(cpu))) {
r = -ENOMEM;
- goto out_free_4;
+ goto err_cpu_kick_mask;
}
}
+ r = kvm_irqfd_init();
+ if (r)
+ goto err_irqfd;
+
r = kvm_async_pf_init();
if (r)
- goto out_free_4;
+ goto err_async_pf;
kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
- register_syscore_ops(&kvm_syscore_ops);
-
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;
kvm_init_debug();
r = kvm_vfio_ops_init();
- WARN_ON(r);
+ if (WARN_ON_ONCE(r))
+ goto err_vfio;
+
+ /*
+ * Registration _must_ be the very last thing done, as this exposes
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+ */
+ r = misc_register(&kvm_dev);
+ if (r) {
+ pr_err("kvm: misc device register failed\n");
+ goto err_register;
+ }
return 0;
-out_unreg:
+err_register:
+ kvm_vfio_ops_exit();
+err_vfio:
kvm_async_pf_deinit();
-out_free_4:
+err_async_pf:
+ kvm_irqfd_exit();
+err_irqfd:
+err_cpu_kick_mask:
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
-out_free_3:
+err_vcpu_cache:
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+ unregister_syscore_ops(&kvm_syscore_ops);
unregister_reboot_notifier(&kvm_reboot_notifier);
- cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
-out_free_2:
- kvm_arch_hardware_unsetup();
-out_free_1:
- free_cpumask_var(cpus_hardware_enabled);
-out_free_0:
- kvm_irqfd_exit();
-out_irqfd:
- kvm_arch_exit();
-out_fail:
+ cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
+#endif
return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
@@ -6039,21 +6050,25 @@ void kvm_exit(void)
{
int cpu;
- debugfs_remove_recursive(kvm_debugfs_dir);
+ /*
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+ * to KVM while the module is being stopped.
+ */
misc_deregister(&kvm_dev);
+
+ debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
+ kvm_vfio_ops_exit();
kvm_async_pf_deinit();
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
unregister_reboot_notifier(&kvm_reboot_notifier);
- cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
- on_each_cpu(hardware_disable_nolock, NULL, 1);
- kvm_arch_hardware_unsetup();
- kvm_arch_exit();
+ cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
+#endif
kvm_irqfd_exit();
- free_cpumask_var(cpus_hardware_enabled);
- kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);