KVM: Use the scheduler preemption notifiers to make kvm preemptible Current kvm disables preemption while the new virtualization registers are in use. This of course is not very good for latency sensitive workloads (one use of virtualization is to offload user interface and other latency insensitive stuff to a container, so that it is easier to analyze the remaining workload). This patch re-enables preemption for kvm; preemption is now only disabled when switching the registers in and out, and during the switch to guest mode and back. Contains fixes from Shaohua Li <shaohua.li@intel.com>. Signed-off-by: Avi Kivity <avi@qumranet.com>

commit: 15ad71460d75fd7ca41bb248a2310f3f39b302ba [log] [tgz]
author: Avi Kivity <avi@qumranet.com> Wed Jul 11 18:17:21 2007 +0300
committer: Avi Kivity <avi@qumranet.com> Sat Oct 13 10:18:20 2007 +0200
tree: 1ea549e5c5629561c121a54def146fb6b706c2d4
parent: 519ef35341b4f360f072ea74e398b70a5a2fc270 [diff]
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
index 0a419a0..8749fa4 100644
--- a/drivers/kvm/Kconfig
+++ b/drivers/kvm/Kconfig

@@ -17,6 +17,7 @@
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on X86 && EXPERIMENTAL
+	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	---help---
 	  Support hosting fully virtualized guest machines using hardware

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index e92c84b..0667183 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h

@@ -13,6 +13,7 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/preempt.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -301,6 +302,7 @@
 
 struct kvm_vcpu {
 	struct kvm *kvm;
+	struct preempt_notifier preempt_notifier;
 	int vcpu_id;
 	struct mutex mutex;
 	int   cpu;
@@ -429,7 +431,7 @@
 	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
 
-	void (*vcpu_load)(struct kvm_vcpu *vcpu);
+	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 	void (*vcpu_decache)(struct kvm_vcpu *vcpu);
 

diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 2094746..6035e6d 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c

@@ -54,6 +54,8 @@
 
 struct kvm_arch_ops *kvm_arch_ops;
 
+static __read_mostly struct preempt_ops kvm_preempt_ops;
+
 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
 
 static struct kvm_stats_debugfs_item {
@@ -239,13 +241,21 @@
  */
 static void vcpu_load(struct kvm_vcpu *vcpu)
 {
+	int cpu;
+
 	mutex_lock(&vcpu->mutex);
-	kvm_arch_ops->vcpu_load(vcpu);
+	cpu = get_cpu();
+	preempt_notifier_register(&vcpu->preempt_notifier);
+	kvm_arch_ops->vcpu_load(vcpu, cpu);
+	put_cpu();
 }
 
 static void vcpu_put(struct kvm_vcpu *vcpu)
 {
+	preempt_disable();
 	kvm_arch_ops->vcpu_put(vcpu);
+	preempt_notifier_unregister(&vcpu->preempt_notifier);
+	preempt_enable();
 	mutex_unlock(&vcpu->mutex);
 }
 
@@ -1672,9 +1682,7 @@
 {
 	if (!need_resched())
 		return;
-	vcpu_put(vcpu);
 	cond_resched();
-	vcpu_load(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_resched);
 
@@ -1722,11 +1730,9 @@
 	unsigned bytes;
 	int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
 
-	kvm_arch_ops->vcpu_put(vcpu);
 	q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
 		 PAGE_KERNEL);
 	if (!q) {
-		kvm_arch_ops->vcpu_load(vcpu);
 		free_pio_guest_pages(vcpu);
 		return -ENOMEM;
 	}
@@ -1738,7 +1744,6 @@
 		memcpy(p, q, bytes);
 	q -= vcpu->pio.guest_page_offset;
 	vunmap(q);
-	kvm_arch_ops->vcpu_load(vcpu);
 	free_pio_guest_pages(vcpu);
 	return 0;
 }
@@ -2413,6 +2418,8 @@
 	if (IS_ERR(vcpu))
 		return PTR_ERR(vcpu);
 
+	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+
 	vcpu_load(vcpu);
 	r = kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
@@ -3145,6 +3152,27 @@
 
 hpa_t bad_page_address;
 
+static inline
+struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
+{
+	return container_of(pn, struct kvm_vcpu, preempt_notifier);
+}
+
+static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
+{
+	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+
+	kvm_arch_ops->vcpu_load(vcpu, cpu);
+}
+
+static void kvm_sched_out(struct preempt_notifier *pn,
+			  struct task_struct *next)
+{
+	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+
+	kvm_arch_ops->vcpu_put(vcpu);
+}
+
 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
 {
 	int r;
@@ -3191,6 +3219,9 @@
 		goto out_free;
 	}
 
+	kvm_preempt_ops.sched_in = kvm_sched_in;
+	kvm_preempt_ops.sched_out = kvm_sched_out;
+
 	return r;
 
 out_free:

diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 5437de2..396c736 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c

@@ -276,9 +276,7 @@
 	kvm_mmu_free_some_pages(vcpu);
 	if (r < 0) {
 		spin_unlock(&vcpu->kvm->lock);
-		kvm_arch_ops->vcpu_put(vcpu);
 		r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL);
-		kvm_arch_ops->vcpu_load(vcpu);
 		spin_lock(&vcpu->kvm->lock);
 		kvm_mmu_free_some_pages(vcpu);
 	}

diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index 0feec85..3997bbd 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c

@@ -625,12 +625,11 @@
 	kfree(svm);
 }
 
-static void svm_vcpu_load(struct kvm_vcpu *vcpu)
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	int cpu, i;
+	int i;
 
-	cpu = get_cpu();
 	if (unlikely(cpu != vcpu->cpu)) {
 		u64 tsc_this, delta;
 
@@ -657,7 +656,6 @@
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 
 	rdtscll(vcpu->host_tsc);
-	put_cpu();
 }
 
 static void svm_vcpu_decache(struct kvm_vcpu *vcpu)

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 18f9b0b..8c87d20 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c

@@ -396,6 +396,7 @@
 static void vmx_load_host_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long flags;
 
 	if (!vmx->host_state.loaded)
 		return;
@@ -408,12 +409,12 @@
 		 * If we have to reload gs, we must take care to
 		 * preserve our gs base.
 		 */
-		local_irq_disable();
+		local_irq_save(flags);
 		load_gs(vmx->host_state.gs_sel);
 #ifdef CONFIG_X86_64
 		wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
 #endif
-		local_irq_enable();
+		local_irq_restore(flags);
 
 		reload_tss();
 	}
@@ -427,15 +428,12 @@
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
  */
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u64 phys_addr = __pa(vmx->vmcs);
-	int cpu;
 	u64 tsc_this, delta;
 
-	cpu = get_cpu();
-
 	if (vcpu->cpu != cpu)
 		vcpu_clear(vcpu);
 
@@ -480,7 +478,6 @@
 {
 	vmx_load_host_state(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	put_cpu();
 }
 
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -2127,6 +2124,8 @@
 	if (unlikely(r))
 		goto out;
 
+	preempt_disable();
+
 	if (!vcpu->mmio_read_completed)
 		do_interrupt_requests(vcpu, kvm_run);
 
@@ -2269,6 +2268,9 @@
 	vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
 
 	asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+	vmx->launched = 1;
+
+	preempt_enable();
 
 	if (unlikely(fail)) {
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2283,7 +2285,6 @@
 	if (unlikely(prof_on == KVM_PROFILING))
 		profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
 
-	vmx->launched = 1;
 	r = kvm_handle_exit(kvm_run, vcpu);
 	if (r > 0) {
 		/* Give scheduler a change to reschedule. */
@@ -2372,6 +2373,7 @@
 {
 	int err;
 	struct vcpu_vmx *vmx = kzalloc(sizeof(*vmx), GFP_KERNEL);
+	int cpu;
 
 	if (!vmx)
 		return ERR_PTR(-ENOMEM);
@@ -2396,9 +2398,11 @@
 
 	vmcs_clear(vmx->vmcs);
 
-	vmx_vcpu_load(&vmx->vcpu);
+	cpu = get_cpu();
+	vmx_vcpu_load(&vmx->vcpu, cpu);
 	err = vmx_vcpu_setup(&vmx->vcpu);
 	vmx_vcpu_put(&vmx->vcpu);
+	put_cpu();
 	if (err)
 		goto free_vmcs;
commit	15ad71460d75fd7ca41bb248a2310f3f39b302ba	[log] [tgz]
author	Avi Kivity <avi@qumranet.com>	Wed Jul 11 18:17:21 2007 +0300
committer	Avi Kivity <avi@qumranet.com>	Sat Oct 13 10:18:20 2007 +0200
tree	1ea549e5c5629561c121a54def146fb6b706c2d4
parent	519ef35341b4f360f072ea74e398b70a5a2fc270 [diff]