diff options
49 files changed, 646 insertions, 499 deletions
diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl index 028a8444d95..e8acd1f0345 100644 --- a/Documentation/DocBook/kgdb.tmpl +++ b/Documentation/DocBook/kgdb.tmpl @@ -84,10 +84,9 @@ runs an instance of gdb against the vmlinux file which contains the symbols (not boot image such as bzImage, zImage, uImage...). In gdb the developer specifies the connection parameters and - connects to kgdb. Depending on which kgdb I/O modules exist in - the kernel for a given architecture, it may be possible to debug - the test machine's kernel with the development machine using a - rs232 or ethernet connection. + connects to kgdb. The type of connection a developer makes with + gdb depends on the availability of kgdb I/O modules compiled as + builtin's or kernel modules in the test machine's kernel. </para> </chapter> <chapter id="CompilingAKernel"> @@ -223,7 +222,7 @@ </para> <para> IMPORTANT NOTE: Using this option with kgdb over the console - (kgdboc) or kgdb over ethernet (kgdboe) is not supported. + (kgdboc) is not supported. </para> </sect1> </chapter> @@ -249,18 +248,11 @@ (gdb) target remote /dev/ttyS0 </programlisting> <para> - Example (kgdb to a terminal server): + Example (kgdb to a terminal server on tcp port 2012): </para> <programlisting> % gdb ./vmlinux - (gdb) target remote udp:192.168.2.2:6443 - </programlisting> - <para> - Example (kgdb over ethernet): - </para> - <programlisting> - % gdb ./vmlinux - (gdb) target remote udp:192.168.2.2:6443 + (gdb) target remote 192.168.2.2:2012 </programlisting> <para> Once connected, you can debug a kernel the way you would debug an @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 26 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc8 NAME = Rotary Wombat # *DOCUMENTATION* diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 082c31dcfd9..39752cdef6f 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -558,8 +558,6 @@ static struct iosapic_rte_info * __init_refok iosapic_alloc_rte (void) if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) { rte = alloc_bootmem(sizeof(struct iosapic_rte_info) * NR_PREALLOCATE_RTE_ENTRIES); - if (!rte) - return NULL; for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++) list_add(&rte->rte_list, &free_rte_list); } diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index f48a809c686..4ae15c8c248 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -578,8 +578,6 @@ setup_arch (char **cmdline_p) cpu_init(); /* initialize the bootstrap CPU */ mmu_context_init(); /* initialize context_id bitmap */ - check_sal_cache_flush(); - #ifdef CONFIG_ACPI acpi_boot_init(); #endif @@ -607,6 +605,7 @@ setup_arch (char **cmdline_p) ia64_mca_init(); platform_setup(cmdline_p); + check_sal_cache_flush(); paging_init(); } diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index 6dd886c5d86..e585f9a2afb 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -512,7 +512,7 @@ static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, si int cpu; char optstr[64]; - if (count > sizeof(optstr)) + if (count == 0 || count > sizeof(optstr)) return -EINVAL; if (copy_from_user(optstr, user, count)) return -EFAULT; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0937c03af3..40013514855 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -385,6 +385,7 @@ config VMI config KVM_CLOCK bool "KVM paravirtualized clock" select PARAVIRT + select PARAVIRT_CLOCK depends on !(X86_VISWS || X86_VOYAGER) help Turning on this option will allow you to run a paravirtualized clock @@ -412,6 +413,10 @@ config PARAVIRT over full virtualization. However, when run without a hypervisor the kernel is theoretically slower and slightly larger. +config PARAVIRT_CLOCK + bool + default n + endif config MEMTEST_BOOTPARAM diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 739d49acd2f..5ff67208d4a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -90,6 +90,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 08a30986d47..87edf1ceb1d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -18,6 +18,7 @@ #include <linux/clocksource.h> #include <linux/kvm_para.h> +#include <asm/pvclock.h> #include <asm/arch_hooks.h> #include <asm/msr.h> #include <asm/apic.h> @@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg) early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); -#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field +static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); +static struct pvclock_wall_clock wall_clock; -static inline u64 kvm_get_delta(u64 last_tsc) -{ - int cpu = smp_processor_id(); - u64 delta = native_read_tsc() - last_tsc; - return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; -} - -static struct kvm_wall_clock wall_clock; -static cycle_t kvm_clock_read(void); /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for @@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void); */ static unsigned long kvm_get_wallclock(void) { - u32 wc_sec, wc_nsec; - u64 delta; + struct pvclock_vcpu_time_info *vcpu_time; struct timespec ts; - int version, nsec; int low, high; low = (int)__pa(&wall_clock); high = ((u64)__pa(&wall_clock) >> 32); + native_write_msr(MSR_KVM_WALL_CLOCK, low, high); - delta = kvm_clock_read(); + vcpu_time = &get_cpu_var(hv_clock); + pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); + put_cpu_var(hv_clock); - native_write_msr(MSR_KVM_WALL_CLOCK, low, high); - do { - version = wall_clock.wc_version; - rmb(); - wc_sec = wall_clock.wc_sec; - wc_nsec = wall_clock.wc_nsec; - rmb(); - } while ((wall_clock.wc_version != version) || (version & 1)); - - delta = kvm_clock_read() - delta; - delta += wc_nsec; - nsec = do_div(delta, NSEC_PER_SEC); - set_normalized_timespec(&ts, wc_sec + delta, nsec); - /* - * Of all mechanisms of time adjustment I've tested, this one - * was the champion! - */ - return ts.tv_sec + 1; + return ts.tv_sec; } static int kvm_set_wallclock(unsigned long now) { - return 0; + return -1; } -/* - * This is our read_clock function. The host puts an tsc timestamp each time - * it updates a new time. Without the tsc adjustment, we can have a situation - * in which a vcpu starts to run earlier (smaller system_time), but probes - * time later (compared to another vcpu), leading to backwards time - */ static cycle_t kvm_clock_read(void) { - u64 last_tsc, now; - int cpu; + struct pvclock_vcpu_time_info *src; + cycle_t ret; - preempt_disable(); - cpu = smp_processor_id(); - - last_tsc = get_clock(cpu, tsc_timestamp); - now = get_clock(cpu, system_time); - - now += kvm_get_delta(last_tsc); - preempt_enable(); - - return now; + src = &get_cpu_var(hv_clock); + ret = pvclock_clocksource_read(src); + put_cpu_var(hv_clock); + return ret; } + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_read, @@ -123,13 +88,14 @@ static struct clocksource kvm_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static int kvm_register_clock(void) +static int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high; low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); - + printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", + cpu, high, low, txt); return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); } @@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void) * Now that the first cpu already had this clocksource initialized, * we shouldn't fail. */ - WARN_ON(kvm_register_clock()); + WARN_ON(kvm_register_clock("secondary cpu clock")); /* ok, done with our trickery, call native */ setup_secondary_APIC_clock(); } #endif +#ifdef CONFIG_SMP +void __init kvm_smp_prepare_boot_cpu(void) +{ + WARN_ON(kvm_register_clock("primary cpu clock")); + native_smp_prepare_boot_cpu(); +} +#endif + /* * After the clock is registered, the host will keep writing to the * registered memory location. If the guest happens to shutdown, this memory @@ -174,7 +148,7 @@ void __init kvmclock_init(void) return; if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { - if (kvm_register_clock()) + if (kvm_register_clock("boot clock")) return; pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; @@ -182,6 +156,9 @@ void __init kvmclock_init(void) #ifdef CONFIG_X86_LOCAL_APIC pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; #endif +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; +#endif machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 00000000000..05fbe9a0325 --- /dev/null +++ b/arch/x86/kernel/pvclock.c @@ -0,0 +1,141 @@ +/* paravirtual clock -- common code used by kvm/xen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <asm/pvclock.h> + +/* + * These are perodically updated + * xen: magic shared_info page + * kvm: gpa registered via msr + * and then copied here. + */ +struct pvclock_shadow_time { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) +{ + u64 delta = native_read_tsc() - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +/* + * Reads a consistent set of time-base values from hypervisor, + * into a shadow data area. + */ +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, + struct pvclock_vcpu_time_info *src) +{ + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) || (dst->version != src->version)); + + return dst->version; +} + +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ + struct pvclock_shadow_time shadow; + unsigned version; + cycle_t ret, offset; + + do { + version = pvclock_get_time_values(&shadow, src); + barrier(); + offset = pvclock_get_nsec_offset(&shadow); + ret = shadow.system_timestamp + offset; + barrier(); + } while (version != src->version); + + return ret; +} + +void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, + struct pvclock_vcpu_time_info *vcpu_time, + struct timespec *ts) +{ + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = wall_clock->version; + rmb(); /* fetch version before time */ + now.tv_sec = wall_clock->sec; + now.tv_nsec = wall_clock->nsec; + rmb(); /* fetch time before checking version */ + } while ((wall_clock->version & 1) || (version != wall_clock->version)); + + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f2f5d260874..3829aa7b663 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps) atomic_inc(&pt->pending); smp_mb__after_atomic_inc(); - if (vcpu0 && waitqueue_active(&vcpu0->wq)) { - vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; - wake_up_interruptible(&vcpu0->wq); + if (vcpu0) { + set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); + if (waitqueue_active(&vcpu0->wq)) { + vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; + wake_up_interruptible(&vcpu0->wq); + } } pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c297c50eba6..ebc03f5ae16 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic) wait_queue_head_t *q = &apic->vcpu->wq; atomic_inc(&apic->timer.pending); + set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); if (waitqueue_active(q)) { apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; wake_up_interruptible(q); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ee3f53098f0..7e7c3969f7a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) rmap_remove(kvm, spte); --kvm->stat.lpages; set_shadow_pte(spte, shadow_trap_nonpresent_pte); + spte = NULL; write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); @@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, struct kvm_mmu_page *shadow; spte |= PT_WRITABLE_MASK; - if (user_fault) { - mmu_unshadow(vcpu->kvm, gfn); - goto unshadowed; - } shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); if (shadow || @@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } -unshadowed: - if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); @@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, u64 *spte, const void *new) { - if ((sp->role.level != PT_PAGE_TABLE_LEVEL) - && !vcpu->arch.update_pte.largepage) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } + if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + if (!vcpu->arch.update_pte.largepage || + sp->role.glevels == PT32_ROOT_LEVEL) { + ++vcpu->kvm->stat.mmu_pde_zapped; + return; + } + } ++vcpu->kvm->stat.mmu_pte_updated; if (sp->role.glevels == PT32_ROOT_LEVEL) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 02efbe75f31..540e9517907 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) load_transition_efer(vmx); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +static void __vmx_load_host_state(struct vcpu_vmx *vmx) { unsigned long flags; @@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) reload_host_efer(vmx); } +static void vmx_load_host_state(struct vcpu_vmx *vmx) +{ + preempt_disable(); + __vmx_load_host_state(vmx); + preempt_enable(); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - vmx_load_host_state(to_vmx(vcpu)); + __vmx_load_host_state(to_vmx(vcpu)); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) switch (msr_index) { #ifdef CONFIG_X86_64 case MSR_EFER: + vmx_load_host_state(vmx); ret = kvm_set_msr_common(vcpu, msr_index, data); - if (vmx->host_state.loaded) { - reload_host_efer(vmx); - load_transition_efer(vmx); - } break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); @@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) guest_write_tsc(data); break; default: + vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; - if (vmx->host_state.loaded) - load_msrs(vmx->guest_msrs, vmx->save_nmsrs); break; } ret = kvm_set_msr_common(vcpu, msr_index, data); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00acf1301a1..63a77caa59f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { static int version; - struct kvm_wall_clock wc; - struct timespec wc_ts; + struct pvclock_wall_clock wc; + struct timespec now, sys, boot; if (!wall_clock) return; @@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); - wc_ts = current_kernel_time(); - wc.wc_sec = wc_ts.tv_sec; - wc.wc_nsec = wc_ts.tv_nsec; - wc.wc_version = version; + /* + * The guest calculates current wall clock time by adding + * system time (updated by kvm_write_guest_time below) to the + * wall clock specified here. guest system time equals host + * system time for us, thus we must fill in host boot time here. + */ + now = current_kernel_time(); + ktime_get_ts(&sys); + boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); + + wc.sec = boot.tv_sec; + wc.nsec = boot.tv_nsec; + wc.version = version; kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); @@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } +static uint32_t div_frac(uint32_t dividend, uint32_t divisor) +{ + uint32_t quotient, remainder; + + /* Don't try to replace with do_div(), this one calculates + * "(dividend << 32) / divisor" */ + __asm__ ( "divl %4" + : "=a" (quotient), "=d" (remainder) + : "0" (0), "1" (dividend), "r" (divisor) ); + return quotient; +} + +static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) +{ + uint64_t nsecs = 1000000000LL; + int32_t shift = 0; + uint64_t tps64; + uint32_t tps32; + + tps64 = tsc_khz * 1000LL; + while (tps64 > nsecs*2) { + tps64 >>= 1; + shift--; + } + + tps32 = (uint32_t)tps64; + while (tps32 <= (uint32_t)nsecs) { + tps32 <<= 1; + shift++; + } + + hv_clock->tsc_shift = shift; + hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + + pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", + __FUNCTION__, tsc_khz, hv_clock->tsc_shift, + hv_clock->tsc_to_system_mul); +} + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; @@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) if ((!vcpu->time_page)) return; + if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { + kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = tsc_khz; + } + /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, @@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate - * state, we just write "2" at the end + * state, we just increase by 2 at the end. */ - vcpu->hv_clock.version = 2; + vcpu->hv_clock.version += 2; shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + sizeof(vcpu->hv_clock)); kunmap_atomic(shared_kaddr, KM_USER0); @@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - vcpu->arch.hv_clock.tsc_to_system_mul = - clocksource_khz2mult(tsc_khz, 22); - vcpu->arch.hv_clock.tsc_shift = 22; - down_read(¤t->mm->mmap_sem); vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); @@ -2759,6 +2808,8 @@ again: if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests)) { kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; @@ -2772,6 +2823,7 @@ again: } } + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); kvm_inject_pending_timer_irqs(vcpu); preempt_disable(); @@ -2781,21 +2833,13 @@ again: local_irq_disable(); - if (need_resched()) { + if (vcpu->requests || need_resched()) { local_irq_enable(); preempt_enable(); r = 1; goto out; } - if (vcpu->requests) - if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) { - local_irq_enable(); - preempt_enable(); - r = 1; - goto out; - } - if (signal_pending(current)) { local_irq_enable(); preempt_enable(); @@ -2825,9 +2869,6 @@ again: kvm_guest_enter(); - if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) - kvm_x86_ops->tlb_flush(vcpu); KVMTRACE_0D(VMENTRY, vcpu, entryexit); kvm_x86_ops->run(vcpu, kvm_run); diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be2737..6c388e593bc 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -5,8 +5,9 @@ config XEN bool "Xen guest support" select PARAVIRT + select PARAVIRT_CLOCK depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) + depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e457d6..f09c1c69c37 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + int i; /* special set_pte for pagetable initialization */ pv_mmu_ops.set_pte = xen_set_pte_init; init_mm.pgd = base; /* - * copy top-level of Xen-supplied pagetable into place. For - * !PAE we can use this as-is, but for PAE it is a stand-in - * while we copy the pmd pages. + * copy top-level of Xen-supplied pagetable into place. This + * is a stand-in while we copy the pmd pages. */ memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - if (PTRS_PER_PMD > 1) { - int i; - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); - make_lowmem_page_readonly(pmd); + make_lowmem_page_readonly(pmd); - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); } /* make sure zero_page is mapped RO so we can use it in pagetables */ @@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ - { - unsigned level; - -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); - } + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); } /* This is called once we have the cpu_possible_map */ @@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pte = xen_make_pte, .make_pgd = xen_make_pgd, -#ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, .set_pud = xen_set_pud, @@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pmd = xen_make_pmd, .pmd_val = xen_pmd_val, -#endif /* PAE */ .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, @@ -1228,6 +1213,11 @@ asmlinkage void __init xen_start_kernel(void) if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!is_initial_xendomain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + /* set the limit of our address space */ xen_reserve_top(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3525ef523a7..df40bf74ea7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -179,50 +179,56 @@ out: preempt_enable(); } -pteval_t xen_pte_val(pte_t pte) +/* Assume pteval_t is equivalent to all the other *val_t types. */ +static pteval_t pte_mfn_to_pfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; + } + + return val; +} + +static pteval_t pte_pfn_to_mfn(pteval_t val) { - pteval_t ret = pte.pte; + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + } - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + return val; +} - return ret; +pteval_t xen_pte_val(pte_t pte) +{ + return pte_mfn_to_pfn(pte.pte); } pgdval_t xen_pgd_val(pgd_t pgd) { - pgdval_t ret = pgd.pgd; - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pgd.pgd); } pte_t xen_make_pte(pteval_t pte) { - if (pte & _PAGE_PRESENT) { - pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } - - return (pte_t){ .pte = pte }; + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); } pgd_t xen_make_pgd(pgdval_t pgd) { - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; + pgd = pte_pfn_to_mfn(pgd); + return native_make_pgd(pgd); } pmdval_t xen_pmd_val(pmd_t pmd) { - pmdval_t ret = native_pmd_val(pmd); - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pmd.pmd); } -#ifdef CONFIG_X86_PAE + void xen_set_pud(pud_t *ptr, pud_t val) { struct multicall_space mcs; @@ -267,17 +273,9 @@ void xen_pmd_clear(pmd_t *pmdp) pmd_t xen_make_pmd(pmdval_t pmd) { - if (pmd & _PAGE_PRESENT) - pmd = phys_to_machine(XPADDR(pmd)).maddr; - + pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } -#else /* !PAE */ -void xen_set_pte(pte_t *ptep, pte_t pte) -{ - *ptep = pte; -} -#endif /* CONFIG_X86_PAE */ /* (Yet another) pagetable walker. This one is intended for pinning a @@ -430,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level) read-only, and can be pinned. */ void xen_pgd_pin(pgd_t *pgd) { - unsigned level; - xen_mc_batch(); if (pgd_walk(pgd, pin_page, TASK_SIZE)) { @@ -441,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); } -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - xen_do_pin(level, PFN_DOWN(__pa(pgd))); - + xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); xen_mc_issue(0); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index b5e189b1519..5fe961caffd 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm); void xen_pgd_pin(pgd_t *pgd); //void xen_pgd_unpin(pgd_t *pgd); -#ifdef CONFIG_X86_PAE -unsigned long long xen_pte_val(pte_t); -unsigned long long xen_pmd_val(pmd_t); -unsigned long long xen_pgd_val(pgd_t); +pteval_t xen_pte_val(pte_t); +pmdval_t xen_pmd_val(pmd_t); +pgdval_t xen_pgd_val(pgd_t); -pte_t xen_make_pte(unsigned long long); -pmd_t xen_make_pmd(unsigned long long); -pgd_t xen_make_pgd(unsigned long long); +pte_t xen_make_pte(pteval_t); +pmd_t xen_make_pmd(pmdval_t); +pgd_t xen_make_pgd(pgdval_t); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); @@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val); void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_pmd_clear(pmd_t *pmdp); - -#else -unsigned long xen_pte_val(pte_t); -unsigned long xen_pmd_val(pmd_t); -unsigned long xen_pgd_val(pgd_t); - -pte_t xen_make_pte(unsigned long); -pmd_t xen_make_pmd(unsigned long); -pgd_t xen_make_pgd(unsigned long); -#endif - #endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 52b2e385698..41e217503c9 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -14,6 +14,7 @@ #include <linux/kernel_stat.h> #include <linux/math64.h> +#include <asm/pvclock.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -31,17 +32,6 @@ static cycle_t xen_clocksource_read(void); -/* These are perodically updated in shared_info, and then copied here. */ -struct shadow_time_info { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. */ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; -}; - -static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); - /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void) unsigned long xen_cpu_khz(void) { u64 xen_khz = 1000000ULL << 32; - const struct vcpu_time_info *info = + const struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; do_div(xen_khz, info->tsc_to_system_mul); @@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void) return xen_khz; } -/* - * Reads a consistent set of time-base values from Xen, into a shadow data - * area. - */ -static unsigned get_time_values_from_xen(void) -{ - struct vcpu_time_info *src; - struct shadow_time_info *dst; - - /* src is shared memory with the hypervisor, so we need to - make sure we get a consistent snapshot, even in the face of - being preempted. */ - src = &__get_cpu_var(xen_vcpu)->time; - dst = &__get_cpu_var(shadow_time); - - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) | (dst->version ^ src->version)); - - return dst->version; -} - -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - - return product; -} - -static u64 get_nsec_offset(struct shadow_time_info *shadow) -{ - u64 now, delta; - now = native_read_tsc(); - delta = now - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); -} - static cycle_t xen_clocksource_read(void) { - struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + struct pvclock_vcpu_time_info *src; cycle_t ret; - unsigned version; - - do { - version = get_time_values_from_xen(); - barrier(); - ret = shadow->system_timestamp + get_nsec_offset(shadow); - barrier(); - } while (version != __get_cpu_var(xen_vcpu)->time.version); - - put_cpu_var(shadow_time); + src = &get_cpu_var(xen_vcpu)->time; + ret = pvclock_clocksource_read(src); + put_cpu_var(xen_vcpu); return ret; } static void xen_read_wallclock(struct timespec *ts) { - const struct shared_info *s = HYPERVISOR_shared_info; - u32 version; - u64 delta; - struct timespec now; - - /* get wallclock at system boot */ - do { - version = s->wc_version; - rmb(); /* fetch version before time */ - now.tv_sec = s->wc_sec; - now.tv_nsec = s->wc_nsec; - rmb(); /* fetch time before checking version */ - } while ((s->wc_version & 1) | (version ^ s->wc_version)); + struct shared_info *s = HYPERVISOR_shared_info; + struct pvclock_wall_clock *wall_clock = &(s->wc); + struct pvclock_vcpu_time_info *vcpu_time; - delta = xen_clocksource_read(); /* time since system boot */ - delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; - - now.tv_nsec = do_div(delta, NSEC_PER_SEC); - now.tv_sec = delta; - - set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); + vcpu_time = &get_cpu_var(xen_vcpu)->time; + pvclock_read_wallclock(wall_clock, vcpu_time, ts); + put_cpu_var(xen_vcpu); } unsigned long xen_get_wallclock(void) @@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void) struct timespec ts; xen_read_wallclock(&ts); - return ts.tv_sec; } @@ -569,8 +463,6 @@ __init void xen_time_init(void) { int cpu = smp_processor_id(); - get_time_values_from_xen(); - clocksource_register(&xen_clocksource); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 288d587ce73..6ec3b4f7719 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -17,7 +17,7 @@ ENTRY(startup_xen) __FINIT -.pushsection .bss.page_aligned +.pushsection .text .align PAGE_SIZE_asm ENTRY(hypercall_page) .skip 0x1000 @@ -30,11 +30,7 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") -#ifdef CONFIG_X86_PAE ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") -#else - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") -#endif ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") #endif /*CONFIG_XEN */ diff --git a/drivers/char/drm/i915_drv.c b/drivers/char/drm/i915_drv.c index e8f3d682e3b..93aed1c38bd 100644 --- a/drivers/char/drm/i915_drv.c +++ b/drivers/char/drm/i915_drv.c @@ -389,6 +389,7 @@ static int i915_resume(struct drm_device *dev) pci_restore_state(dev->pdev); if (pci_enable_device(dev->pdev)) return -1; + pci_set_master(dev->pdev); pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB); diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c index b1a757a5ee2..8f81139d619 100644 --- a/drivers/char/tty_ioctl.c +++ b/drivers/char/tty_ioctl.c @@ -981,16 +981,9 @@ EXPORT_SYMBOL_GPL(tty_perform_flush); int n_tty_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg) { - struct tty_struct *real_tty; unsigned long flags; int retval; - if (tty->driver->type == TTY_DRIVER_TYPE_PTY && - tty->driver->subtype == PTY_TYPE_MASTER) - real_tty = tty->link; - else - real_tty = tty; - switch (cmd) { case TCXONC: retval = tty_check_change(tty); diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index b224079d4e1..d5862e5d99a 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -109,7 +109,11 @@ static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_m { struct page *page; - page = alloc_pages(gfp_mask, order); + /* + * Use __GFP_ZERO because buggy firmware assumes ICM pages are + * cleared, and subtle failures are seen if they aren't. + */ + page = alloc_pages(gfp_mask | __GFP_ZERO, order); if (!page) return -ENOMEM; diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 5126d5d9ea0..2e554a4ab33 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -176,7 +176,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * we set it now, so we can trap and pass that trap to the Guest if it * uses the FPU. */ if (cpu->ts) - lguest_set_ts(); + unlazy_fpu(current); /* SYSENTER is an optimized way of doing system calls. We can't allow * it because it always jumps to privilege level 0. A normal Guest @@ -196,6 +196,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * trap made the switcher code come back, and an error code which some * traps set. */ + /* Restore SYSENTER if it's supposed to be on. */ + if (boot_cpu_has(X86_FEATURE_SEP)) + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + /* If the Guest page faulted, then the cr2 register will tell us the * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite @@ -203,13 +207,12 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) if (cpu->regs->trapnum == 14) cpu->arch.last_pagefault = read_cr2(); /* Similarly, if we took a trap because the Guest used the FPU, - * we have to restore the FPU it expects to see. */ + * we have to restore the FPU it expects to see. + * math_state_restore() may sleep and we may even move off to + * a different CPU. So all the critical stuff should be done + * before this. */ else if (cpu->regs->trapnum == 7) math_state_restore(); - - /* Restore SYSENTER if it's supposed to be on. */ - if (boot_cpu_has(X86_FEATURE_SEP)) - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); } /*H:130 Now we've examined the hypercall code; our Guest can make requests. diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 8662a6b7a30..25b352b664d 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -68,7 +68,6 @@ obj-$(CONFIG_WAFER_WDT) += wafer5823wdt.o obj-$(CONFIG_I6300ESB_WDT) += i6300esb.o obj-$(CONFIG_ITCO_WDT) += iTCO_wdt.o iTCO_vendor_support.o obj-$(CONFIG_IT8712F_WDT) += it8712f_wdt.o -CFLAGS_hpwdt.o += -O obj-$(CONFIG_HP_WATCHDOG) += hpwdt.o obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 4f0f22b020e..76e5b7386af 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -529,7 +529,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ /* Clear master flag /before/ clearing selector flag. */ - rmb(); + wmb(); #endif pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); while (pending_words != 0) { diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index c19184f2e70..bec76b1c2bb 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -246,15 +246,11 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block, } -static inline unsigned int zero_metapath_length(const struct metapath *mp, - unsigned height) +static inline unsigned int metapath_branch_start(const struct metapath *mp) { - unsigned int i; - for (i = 0; i < height - 1; i++) { - if (mp->mp_list[i] != 0) - return i; - } - return height; + if (mp->mp_list[0] == 0) + return 2; + return 1; } /** @@ -436,7 +432,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh = mp->mp_bh[0]; u64 bn, dblock = 0; - unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0; + unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = height - 1; @@ -471,9 +467,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, /* Building up tree height */ state = ALLOC_GROW_HEIGHT; iblks = height - ip->i_height; - zmpl = zero_metapath_length(mp, height); - iblks -= zmpl; - iblks += height; + branch_start = metapath_branch_start(mp); + iblks += (height - branch_start); } } @@ -509,13 +504,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, sizeof(struct gfs2_meta_header)); *ptr = zero_bn; state = ALLOC_GROW_DEPTH; - for(i = zmpl; i < height; i++) { + for(i = branch_start; i < height; i++) { if (mp->mp_bh[i] == NULL) break; brelse(mp->mp_bh[i]); mp->mp_bh[i] = NULL; } - i = zmpl; + i = branch_start; } if (n == 0) break; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6387523a315..3401628d742 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -195,7 +195,7 @@ ulong_aligned: depending on architecture. I've experimented with several ways of writing this section such as using an else before the goto but this one seems to be the fastest. */ - while ((unsigned char *)plong < end - 1) { + while ((unsigned char *)plong < end - sizeof(unsigned long)) { prefetch(plong + 1); if (((*plong) & LBITMASK) != lskipval) break; diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 49c7cd0502c..779d2eb649c 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res) { struct nfs_fh *fh = res->fh; + unsigned size; if ((res->status = ntohl(*p++)) == 0) { - int size = ntohl(*p++); - if (size <= NFS3_FHSIZE) { + size = ntohl(*p++); + if (size <= NFS3_FHSIZE && size != 0) { fh->size = size; memcpy(fh->data, p, size); } else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2a4a024a4e7..614efeed543 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1216,8 +1216,6 @@ static int nfs_validate_mount_data(void *options, { struct nfs_mount_data *data = (struct nfs_mount_data *)options; - memset(args, 0, sizeof(*args)); - if (data == NULL) goto out_no_data; @@ -1251,13 +1249,13 @@ static int nfs_validate_mount_data(void *options, case 5: memset(data->context, 0, sizeof(data->context)); case 6: - if (data->flags & NFS_MOUNT_VER3) + if (data->flags & NFS_MOUNT_VER3) { + if (data->root.size > NFS3_FHSIZE || data->root.size == 0) + goto out_invalid_fh; mntfh->size = data->root.size; - else + } else mntfh->size = NFS2_FHSIZE; - if (mntfh->size > sizeof(mntfh->data)) - goto out_invalid_fh; memcpy(mntfh->data, data->root.data, mntfh->size); if (mntfh->size < sizeof(mntfh->data)) @@ -1585,24 +1583,29 @@ static int nfs_get_sb(struct file_system_type *fs_type, { struct nfs_server *server = NULL; struct super_block *s; - struct nfs_fh mntfh; - struct nfs_parsed_mount_data data; + struct nfs_parsed_mount_data *data; + struct nfs_fh *mntfh; struct dentry *mntroot; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error; + int error = -ENOMEM; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + if (data == NULL || mntfh == NULL) + goto out_free_fh; - security_init_mnt_opts(&data.lsm_opts); + security_init_mnt_opts(&data->lsm_opts); /* Validate the mount data */ - error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); + error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs_create_server(&data, &mntfh); + server = nfs_create_server(data, mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -1630,16 +1633,16 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ - nfs_fill_super(s, &data); + nfs_fill_super(s, data); } - mntroot = nfs_get_root(s, &mntfh); + mntroot = nfs_get_root(s, mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; } - error = security_sb_set_mnt_opts(s, &data.lsm_opts); + error = security_sb_set_mnt_opts(s, &data->lsm_opts); if (error) goto error_splat_root; @@ -1649,9 +1652,12 @@ static int nfs_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(data.nfs_server.hostname); - kfree(data.mount_server.hostname); - security_free_mnt_opts(&data.lsm_opts); + kfree(data->nfs_server.hostname); + kfree(data->mount_server.hostname); + security_free_mnt_opts(&data->lsm_opts); +out_free_fh: + kfree(mntfh); + kfree(data); return error; out_err_nosb: @@ -1800,8 +1806,6 @@ static int nfs4_validate_mount_data(void *options, struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; - memset(args, 0, sizeof(*args)); - if (data == NULL) goto out_no_data; @@ -1959,26 +1963,31 @@ out_no_client_address: static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { - struct nfs_parsed_mount_data data; + struct nfs_parsed_mount_data *data; struct super_block *s; struct nfs_server *server; - struct nfs_fh mntfh; + struct nfs_fh *mntfh; struct dentry *mntroot; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error; + int error = -ENOMEM; - security_init_mnt_opts(&data.lsm_opts); + data = kzalloc(sizeof(*data), GFP_KERNEL); + mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + if (data == NULL || mntfh == NULL) + goto out_free_fh; + + security_init_mnt_opts(&data->lsm_opts); /* Validate the mount data */ - error = nfs4_validate_mount_data(raw_data, &data, dev_name); + error = nfs4_validate_mount_data(raw_data, data, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs4_create_server(&data, &mntfh); + server = nfs4_create_server(data, mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -2009,13 +2018,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, nfs4_fill_super(s); } - mntroot = nfs4_get_root(s, &mntfh); + mntroot = nfs4_get_root(s, mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; } - error = security_sb_set_mnt_opts(s, &data.lsm_opts); + error = security_sb_set_mnt_opts(s, &data->lsm_opts); if (error) goto error_splat_root; @@ -2025,10 +2034,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(data.client_address); - kfree(data.nfs_server.export_path); - kfree(data.nfs_server.hostname); - security_free_mnt_opts(&data.lsm_opts); + kfree(data->client_address); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + security_free_mnt_opts(&data->lsm_opts); +out_free_fh: + kfree(mntfh); + kfree(data); return error; out_free: diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6d8ace3e325..f333848fd3b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -739,12 +739,13 @@ int nfs_updatepage(struct file *file, struct page *page, } status = nfs_writepage_setup(ctx, page, offset, count); - __set_page_dirty_nobuffers(page); + if (status < 0) + nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", status, (long long)i_size_read(inode)); - if (status < 0) - nfs_set_pageerror(page); return status; } diff --git a/fs/select.c b/fs/select.c index 8dda969614a..da0e88201c3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -249,7 +249,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) retval++; } } - cond_resched(); } if (res_in) *rinp = res_in; @@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) *routp = res_out; if (res_ex) *rexp = res_ex; + cond_resched(); } wait = NULL; if (retval || !*timeout || signal_pending(current)) diff --git a/include/asm-alpha/percpu.h b/include/asm-alpha/percpu.h index 82e8a94b4b2..3495e8e00d7 100644 --- a/include/asm-alpha/percpu.h +++ b/include/asm-alpha/percpu.h @@ -69,6 +69,8 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define __get_cpu_var(var) per_cpu_var(var) #define __raw_get_cpu_var(var) per_cpu_var(var) +#define PER_CPU_ATTRIBUTES + #endif /* SMP */ #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name) diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 1d8cd01fa51..844f2a89afb 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -18,6 +18,7 @@ #include <linux/kvm_para.h> #include <linux/kvm_types.h> +#include <asm/pvclock-abi.h> #include <asm/desc.h> #define KVM_MAX_VCPUS 16 @@ -282,7 +283,8 @@ struct kvm_vcpu_arch { struct x86_emulate_ctxt emulate_ctxt; gpa_t time; - struct kvm_vcpu_time_info hv_clock; + struct pvclock_vcpu_time_info hv_clock; + unsigned int hv_clock_tsc_khz; unsigned int time_offset; struct page *time_page; }; diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h index 50984594207..bfd9900742b 100644 --- a/include/asm-x86/kvm_para.h +++ b/include/asm-x86/kvm_para.h @@ -48,24 +48,6 @@ struct kvm_mmu_op_release_pt { #ifdef __KERNEL__ #include <asm/processor.h> -/* xen binary-compatible interface. See xen headers for details */ -struct kvm_vcpu_time_info { - uint32_t version; - uint32_t pad0; - uint64_t tsc_timestamp; - uint64_t system_time; - uint32_t tsc_to_system_mul; - int8_t tsc_shift; - int8_t pad[3]; -} __attribute__((__packed__)); /* 32 bytes */ - -struct kvm_wall_clock { - uint32_t wc_version; - uint32_t wc_sec; - uint32_t wc_nsec; -} __attribute__((__packed__)); - - extern void kvmclock_init(void); diff --git a/include/asm-x86/pvclock-abi.h b/include/asm-x86/pvclock-abi.h new file mode 100644 index 00000000000..6857f840b24 --- /dev/null +++ b/include/asm-x86/pvclock-abi.h @@ -0,0 +1,42 @@ +#ifndef _ASM_X86_PVCLOCK_ABI_H_ +#define _ASM_X86_PVCLOCK_ABI_H_ +#ifndef __ASSEMBLY__ + +/* + * These structs MUST NOT be changed. + * They are the ABI between hypervisor and guest OS. + * Both Xen and KVM are using this. + * + * pvclock_vcpu_time_info holds the system time and the tsc timestamp + * of the last update. So the guest can use the tsc delta to get a + * more precise system time. There is one per virtual cpu. + * + * pvclock_wall_clock references the point in time when the system + * time was zero (usually boot time), thus the guest calculates the + * current wall clock by adding the system time. + * + * Protocol for the "version" fields is: hypervisor raises it (making + * it uneven) before it starts updating the fields and raises it again + * (making it even) when it is done. Thus the guest can make sure the + * time values it got are consistent by checking the version before + * and after reading them. + */ + +struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 pad[3]; +} __attribute__((__packed__)); /* 32 bytes */ + +struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; +} __attribute__((__packed__)); + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_PVCLOCK_ABI_H_ */ diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h new file mode 100644 index 00000000000..85b1bba8e0a --- /dev/null +++ b/include/asm-x86/pvclock.h @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_PVCLOCK_H_ +#define _ASM_X86_PVCLOCK_H_ + +#include <linux/clocksource.h> +#include <asm/pvclock-abi.h> + +/* some helper functions for xen and kvm pv clock sources */ +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +void pvclock_read_wallclock(struct pvclock_wall_clock *wall, + struct pvclock_vcpu_time_info *vcpu, + struct timespec *ts); + +#endif /* _ASM_X86_PVCLOCK_H_ */ diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h index baf3a4dce28..e11f24038b1 100644 --- a/include/asm-x86/xen/page.h +++ b/include/asm-x86/xen/page.h @@ -150,13 +150,9 @@ static inline pte_t __pte_ma(pteval_t x) return (pte_t) { .pte = x }; } -#ifdef CONFIG_X86_PAE #define pmd_val_ma(v) ((v).pmd) #define pud_val_ma(v) ((v).pgd.pgd) #define __pmd_ma(x) ((pmd_t) { (x) } ) -#else /* !X86_PAE */ -#define pmd_val_ma(v) ((v).pud.pgd.pgd) -#endif /* CONFIG_X86_PAE */ #define pgd_val_ma(x) ((x).pgd) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 092b1b25291..de9d1df4bba 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -33,6 +33,7 @@ #define KVM_REQ_REPORT_TPR_ACCESS 2 #define KVM_REQ_MMU_RELOAD 3 #define KVM_REQ_TRIPLE_FAULT 4 +#define KVM_REQ_PENDING_TIMER 5 struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 59f1c0bd8f9..d2a00358676 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -27,8 +27,7 @@ * This routine is called by the kernel to write a series of * characters to the tty device. The characters may come from * user space or kernel space. This routine will return the - * number of characters actually accepted for writing. This - * routine is mandatory. + * number of characters actually accepted for writing. * * Optional: Required for writable devices. * @@ -134,7 +133,7 @@ * This routine notifies the tty driver that it should hangup the * tty device. * - * Required: + * Optional: * * void (*break_ctl)(struct tty_stuct *tty, int state); * diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 9b018da48cf..819a0331cda 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -10,6 +10,7 @@ #define __XEN_PUBLIC_XEN_H__ #include <asm/xen/interface.h> +#include <asm/pvclock-abi.h> /* * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). @@ -336,7 +337,7 @@ struct vcpu_info { uint8_t evtchn_upcall_mask; unsigned long evtchn_pending_sel; struct arch_vcpu_info arch; - struct vcpu_time_info time; + struct pvclock_vcpu_time_info time; }; /* 64 bytes (x86) */ /* @@ -384,9 +385,7 @@ struct shared_info { * Wallclock time: updated only by control software. Guests should base * their gettimeofday() syscall on this wallclock-base value. */ - uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */ - uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ - uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */ + struct pvclock_wall_clock wc; struct arch_shared_info arch; diff --git a/kernel/futex.c b/kernel/futex.c index 449def8074f..7d1136e97c1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1096,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *newowner, + struct rw_semaphore *fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; + struct task_struct *oldowner = pi_state->owner; u32 uval, curval, newval; - int ret; + int ret, attempt = 0; /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + + /* + * We are here either because we stole the rtmutex from the + * pending owner or we are the pending owner which failed to + * get the rtmutex. We have to replace the pending owner TID + * in the user space variable. This must be atomic as we have + * to preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the + * pi_state because we can fault here. Imagine swapped out + * pages or a fork, which was running right before we acquired + * mmap_sem, that marked all the anonymous memory readonly for + * cow. + * + * Modifying pi_state _before_ the user space value would + * leave the pi_state in an inconsistent state when we fault + * here, because we need to drop the hash bucket lock to + * handle the fault. This might be observed in the PID check + * in lookup_pi_state. + */ +retry: + if (get_futex_value_locked(&uval, uaddr)) + goto handle_fault; + + while (1) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (curval == -EFAULT) + goto handle_fault; + if (curval == uval) + break; + uval = curval; + } + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ if (pi_state->owner != NULL) { spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); spin_unlock_irq(&pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; + } pi_state->owner = newowner; @@ -1118,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); spin_unlock_irq(&newowner->pi_lock); + return 0; /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. + * To handle the page fault we need to drop the hash bucket + * lock here. That gives the other task (either the pending + * owner itself or the task which stole the rtmutex) the + * chance to try the fixup of the pi_state. So once we are + * back from handling the fault we need to check the pi_state + * after reacquiring the hash bucket lock and before trying to + * do another fixup. When the fixup has been done already we + * simply return. */ - ret = get_futex_value_locked(&uval, uaddr); +handle_fault: + spin_unlock(q->lock_ptr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; + ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + spin_lock(q->lock_ptr); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } - return ret; + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) + return 0; + + if (ret) + return ret; + + goto retry; } /* @@ -1507,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * that case: */ if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr); + ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); } else { /* * Catch the rare case, where the lock was released @@ -1539,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, int res; owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner); + res = fixup_pi_state_owner(uaddr, &q, owner, + fshared); /* propagate -EFAULT, if the fixup failed */ if (res) diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 79e3c90113c..3ec23c3ec97 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1499,7 +1499,8 @@ int kgdb_nmicallback(int cpu, void *regs) return 1; } -void kgdb_console_write(struct console *co, const char *s, unsigned count) +static void kgdb_console_write(struct console *co, const char *s, + unsigned count) { unsigned long flags; diff --git a/kernel/sched.c b/kernel/sched.c index 27eaa47d438..1ffa76813a0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4449,22 +4449,20 @@ do_wait_for_common(struct completion *x, long timeout, int state) signal_pending(current)) || (state == TASK_KILLABLE && fatal_signal_pending(current))) { - __remove_wait_queue(&x->wait, &wait); - return -ERESTARTSYS; + timeout = -ERESTARTSYS; + break; } __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); - if (!timeout) { - __remove_wait_queue(&x->wait, &wait); - return timeout; - } - } while (!x->done); + } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; } x->done--; - return timeout; + return timeout ?: 1; } static long __sched diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1dad5bbb59b..0f3c19197fa 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -250,7 +250,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; spin_unlock(&rt_rq->rt_runtime_lock); - } + } else if (rt_rq->rt_nr_running) + idle = 0; if (enqueue) sched_rt_rq_enqueue(rt_rq); diff --git a/mm/memory.c b/mm/memory.c index 9aefaae4685..d14b251a25a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1045,6 +1045,26 @@ no_page_table: return page; } +/* Can we do the FOLL_ANON optimization? */ +static inline int use_zero_page(struct vm_area_struct *vma) +{ + /* + * We don't want to optimize FOLL_ANON for make_pages_present() + * when it tries to page in a VM_LOCKED region. As to VM_SHARED, + * we want to get the page from the page tables to make sure + * that we serialize and update with any other user of that + * mapping. + */ + if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) + return 0; + /* + * And if we have a fault or a nopfn routine, it's not an + * anonymous region. + */ + return !vma->vm_ops || + (!vma->vm_ops->fault && !vma->vm_ops->nopfn); +} + int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -1119,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags = FOLL_TOUCH; if (pages) foll_flags |= FOLL_GET; - if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || !vma->vm_ops->fault)) + if (!write && use_zero_page(vma)) foll_flags |= FOLL_ANON; do { @@ -1766,7 +1785,6 @@ gotten: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { - page_remove_rmap(old_page, vma); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); @@ -1788,6 +1806,32 @@ gotten: lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); + if (old_page) { + /* + * Only after switching the pte to the new page may + * we remove the mapcount here. Otherwise another + * process may come and find the rmap count decremented + * before the pte is switched to the new page, and + * "reuse" the old page writing into it while our pte + * here still points into it and can be read by other + * threads. + * + * The critical issue is to order this + * page_remove_rmap with the ptp_clear_flush above. + * Those stores are ordered by (if nothing else,) + * the barrier present in the atomic_add_negative + * in page_remove_rmap. + * + * Then the TLB flush in ptep_clear_flush ensures that + * no process can access the old page before the + * decremented mapcount is visible. And the old page + * cannot be reused until after the decremented + * mapcount is visible. So transitively, TLBs to + * old page will be flushed before it can be reused. + */ + page_remove_rmap(old_page, vma); + } + /* Free the old page.. */ new_page = old_page; ret |= VM_FAULT_WRITE; diff --git a/sound/isa/sb/sb_mixer.c b/sound/isa/sb/sb_mixer.c index 91d14224f6b..73d4572d136 100644 --- a/sound/isa/sb/sb_mixer.c +++ b/sound/isa/sb/sb_mixer.c @@ -925,7 +925,7 @@ static unsigned char als4000_saved_regs[] = { static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) { unsigned char *val = chip->saved_regs; - snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return); + snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return); for (; num_regs; num_regs--) *val++ = snd_sbmixer_read(chip, *regs++); } @@ -933,7 +933,7 @@ static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) static void restore_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) { unsigned char *val = chip->saved_regs; - snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return); + snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return); for (; num_regs; num_regs--) snd_sbmixer_write(chip, *regs++, *val++); } diff --git a/sound/pci/aw2/aw2-alsa.c b/sound/pci/aw2/aw2-alsa.c index 56f87cd33c1..3f00ddf450f 100644 --- a/sound/pci/aw2/aw2-alsa.c +++ b/sound/pci/aw2/aw2-alsa.c @@ -316,6 +316,8 @@ static int __devinit snd_aw2_create(struct snd_card *card, return -ENOMEM; } + /* (2) initialization of the chip hardware */ + snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt); if (request_irq(pci->irq, snd_aw2_saa7146_interrupt, IRQF_SHARED, "Audiowerk2", chip)) { @@ -329,8 +331,6 @@ static int __devinit snd_aw2_create(struct snd_card *card, } chip->irq = pci->irq; - /* (2) initialization of the chip hardware */ - snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt); err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops); if (err < 0) { free_irq(chip->irq, (void *)chip); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 98778cb69c6..1dcf9f3d110 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -269,28 +269,9 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) } } -static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) +static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi) { - int i; - - for (i = 0; i < IOAPIC_NUM_PINS; i++) - if (ioapic->redirtbl[i].fields.vector == vector) - return i; - return -1; -} - -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) -{ - struct kvm_ioapic *ioapic = kvm->arch.vioapic; union ioapic_redir_entry *ent; - int gsi; - - gsi = get_eoi_gsi(ioapic, vector); - if (gsi == -1) { - printk(KERN_WARNING "Can't find redir item for %d EOI\n", - vector); - return; - } ent = &ioapic->redirtbl[gsi]; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); @@ -300,6 +281,16 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) ioapic_deliver(ioapic, gsi); } +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) + if (ioapic->redirtbl[i].fields.vector == vector) + __kvm_ioapic_update_eoi(ioapic, i); +} + static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr) { struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; |