-rw-r--r--  arch/x86/kernel/Makefile          |   3
-rw-r--r--  arch/x86/kernel/paravirt.c        |  24
-rw-r--r--  arch/x86/xen/smp.c                | 171
-rw-r--r--  drivers/xen/events.c              |  27
-rw-r--r--  include/asm-x86/paravirt.h        |  43
-rw-r--r--  include/asm-x86/spinlock.h        | 118
-rw-r--r--  include/asm-x86/spinlock_types.h  |   2
-rw-r--r--  include/asm-x86/xen/events.h      |   1
-rw-r--r--  include/xen/events.h              |   7
9 files changed, 379 insertions, 17 deletions
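
Reader's note (not part of the patch): the series below routes the x86 spinlock entry points through a pv_lock_ops table of function pointers, so a paravirtualized guest such as Xen can install its own lock implementation at boot while bare hardware keeps the native ticket locks. The userspace sketch that follows only illustrates that pattern; every name in it (demo_lock_ops, native_lock, demo_use_alternative_locks, ...) is invented for the example and does not appear in the patch.

/*
 * Minimal sketch of an ops-table indirection, in plain C11.  Callers always
 * go through the table; an alternative backend can overwrite entries at
 * initialization time, the way xen_init_spinlocks()/paravirt_use_bytelocks()
 * do for pv_lock_ops in the patch below.
 */
#include <stdio.h>
#include <stdatomic.h>

struct demo_spinlock {
        atomic_flag flag;
};

struct demo_lock_ops {
        void (*lock)(struct demo_spinlock *lock);
        void (*unlock)(struct demo_spinlock *lock);
};

/* "Native" implementation: plain busy-wait on an atomic flag. */
static void native_lock(struct demo_spinlock *lock)
{
        while (atomic_flag_test_and_set_explicit(&lock->flag, memory_order_acquire))
                ;       /* spin */
}

static void native_unlock(struct demo_spinlock *lock)
{
        atomic_flag_clear_explicit(&lock->flag, memory_order_release);
}

/* Alternative implementation a "hypervisor" backend might install. */
static void verbose_lock(struct demo_spinlock *lock)
{
        puts("alternative lock backend used");
        native_lock(lock);
}

/* Default table, analogous to pv_lock_ops pointing at the ticket-lock ops. */
static struct demo_lock_ops demo_lock_ops = {
        .lock = native_lock,
        .unlock = native_unlock,
};

/* Analogous to xen_init_spinlocks(): patch the table at init time. */
static void demo_use_alternative_locks(void)
{
        demo_lock_ops.lock = verbose_lock;
}

int main(void)
{
        struct demo_spinlock lock = { ATOMIC_FLAG_INIT };

        demo_lock_ops.lock(&lock);      /* callers never name a backend directly */
        demo_lock_ops.unlock(&lock);

        demo_use_alternative_locks();
        demo_lock_ops.lock(&lock);
        demo_lock_ops.unlock(&lock);
        return 0;
}

Compile with a C11 compiler (e.g. gcc -std=c11). Because callers only ever see the table, swapping the backend is a boot-time assignment, which is exactly what keeps the native ticket-lock fast path intact when no hypervisor is present.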
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b78a17b1..3db651f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -7,9 +7,10 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
 ifdef CONFIG_FTRACE
-# Do not profile debug utilities
+# Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
+CFLAGS_REMOVE_paravirt.o = -pg
 endif
 
 #
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index b4564d0..097d8a6 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -124,6 +124,7 @@ static void *get_call_destination(u8 type)
                .pv_irq_ops = pv_irq_ops,
                .pv_apic_ops = pv_apic_ops,
                .pv_mmu_ops = pv_mmu_ops,
+               .pv_lock_ops = pv_lock_ops,
        };
        return *((void **)&tmpl + type);
 }
@@ -267,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
        return __get_cpu_var(paravirt_lazy_mode);
 }
 
+void __init paravirt_use_bytelocks(void)
+{
+#ifdef CONFIG_SMP
+       pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
+       pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
+       pv_lock_ops.spin_lock = __byte_spin_lock;
+       pv_lock_ops.spin_trylock = __byte_spin_trylock;
+       pv_lock_ops.spin_unlock = __byte_spin_unlock;
+#endif
+}
+
 struct pv_info pv_info = {
        .name = "bare hardware",
        .paravirt_enabled = 0,
@@ -449,6 +461,18 @@ struct pv_mmu_ops pv_mmu_ops = {
        .set_fixmap = native_set_fixmap,
 };
 
+struct pv_lock_ops pv_lock_ops = {
+#ifdef CONFIG_SMP
+       .spin_is_locked = __ticket_spin_is_locked,
+       .spin_is_contended = __ticket_spin_is_contended,
+
+       .spin_lock = __ticket_spin_lock,
+       .spin_trylock = __ticket_spin_trylock,
+       .spin_unlock = __ticket_spin_unlock,
+#endif
+};
+EXPORT_SYMBOL_GPL(pv_lock_ops);
+
 EXPORT_SYMBOL_GPL(pv_time_ops);
 EXPORT_SYMBOL    (pv_cpu_ops);
 EXPORT_SYMBOL    (pv_mmu_ops);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index f702199..e693812 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -15,6 +15,7 @@
  * This does not handle HOTPLUG_CPU yet.
  */
 #include <linux/sched.h>
+#include <linux/kernel_stat.h>
 #include <linux/err.h>
 #include <linux/smp.h>
 
@@ -35,6 +36,8 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
+static void __cpuinit xen_init_lock_cpu(int cpu);
+
 cpumask_t xen_cpu_initialized_map;
 
 static DEFINE_PER_CPU(int, resched_irq);
@@ -179,6 +182,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 {
        unsigned cpu;
 
+       xen_init_lock_cpu(0);
+
        smp_store_cpu_info(0);
        cpu_data(0).x86_max_cores = 1;
        set_cpu_sibling_map(0);
@@ -301,6 +306,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
        clear_tsk_thread_flag(idle, TIF_FORK);
 #endif
        xen_setup_timer(cpu);
+       xen_init_lock_cpu(cpu);
 
        per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 
@@ -413,6 +419,170 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
+struct xen_spinlock {
+       unsigned char lock;             /* 0 -> free; 1 -> locked */
+       unsigned short spinners;        /* count of waiting cpus */
+};
+
+static int xen_spin_is_locked(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+       return xl->lock != 0;
+}
+
+static int xen_spin_is_contended(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+       /* Not strictly true; this is only the count of contended
+          lock-takers entering the slow path. */
+       return xl->spinners != 0;
+}
+
+static int xen_spin_trylock(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+       u8 old = 1;
+
+       asm("xchgb %b0,%1"
+           : "+q" (old), "+m" (xl->lock) : : "memory");
+
+       return old == 0;
+}
+
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
+
+static inline void spinning_lock(struct xen_spinlock *xl)
+{
+       __get_cpu_var(lock_spinners) = xl;
+       wmb();                  /* set lock of interest before count */
+       asm(LOCK_PREFIX " incw %0"
+           : "+m" (xl->spinners) : : "memory");
+}
+
+static inline void unspinning_lock(struct xen_spinlock *xl)
+{
+       asm(LOCK_PREFIX " decw %0"
+           : "+m" (xl->spinners) : : "memory");
+       wmb();                  /* decrement count before clearing lock */
+       __get_cpu_var(lock_spinners) = NULL;
+}
+
+static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+       int irq = __get_cpu_var(lock_kicker_irq);
+       int ret;
+
+       /* If kicker interrupts not initialized yet, just spin */
+       if (irq == -1)
+               return 0;
+
+       /* announce we're spinning */
+       spinning_lock(xl);
+
+       /* clear pending */
+       xen_clear_irq_pending(irq);
+
+       /* check again make sure it didn't become free while
+          we weren't looking */
+       ret = xen_spin_trylock(lock);
+       if (ret)
+               goto out;
+
+       /* block until irq becomes pending */
+       xen_poll_irq(irq);
+       kstat_this_cpu.irqs[irq]++;
+
+out:
+       unspinning_lock(xl);
+       return ret;
+}
+
+static void xen_spin_lock(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+       int timeout;
+       u8 oldval;
+
+       do {
+               timeout = 1 << 10;
+
+               asm("1: xchgb %1,%0\n"
+                   "   testb %1,%1\n"
+                   "   jz 3f\n"
+                   "2: rep;nop\n"
+                   "   cmpb $0,%0\n"
+                   "   je 1b\n"
+                   "   dec %2\n"
+                   "   jnz 2b\n"
+                   "3:\n"
+                   : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
+                   : "1" (1)
+                   : "memory");
+
+       } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
+}
+
+static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               /* XXX should mix up next cpu selection */
+               if (per_cpu(lock_spinners, cpu) == xl) {
+                       xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+                       break;
+               }
+       }
+}
+
+static void xen_spin_unlock(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+       smp_wmb();              /* make sure no writes get moved after unlock */
+       xl->lock = 0;           /* release lock */
+
+       /* make sure unlock happens before kick */
+       barrier();
+
+       if (unlikely(xl->spinners))
+               xen_spin_unlock_slow(xl);
+}
+
+static __cpuinit void xen_init_lock_cpu(int cpu)
+{
+       int irq;
+       const char *name;
+
+       name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+       irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
+                                    cpu,
+                                    xen_reschedule_interrupt,
+                                    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                    name,
+                                    NULL);
+
+       if (irq >= 0) {
+               disable_irq(irq); /* make sure it's never delivered */
+               per_cpu(lock_kicker_irq, cpu) = irq;
+       }
+
+       printk("cpu %d spinlock event irq %d\n", cpu, irq);
+}
+
+static void __init xen_init_spinlocks(void)
+{
+       pv_lock_ops.spin_is_locked = xen_spin_is_locked;
+       pv_lock_ops.spin_is_contended = xen_spin_is_contended;
+       pv_lock_ops.spin_lock = xen_spin_lock;
+       pv_lock_ops.spin_trylock = xen_spin_trylock;
+       pv_lock_ops.spin_unlock = xen_spin_unlock;
+}
+
 static const struct smp_ops xen_smp_ops __initdata = {
        .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
        .smp_prepare_cpus = xen_smp_prepare_cpus,
@@ -430,4 +600,5 @@ void __init xen_smp_init(void)
 {
        smp_ops = xen_smp_ops;
        xen_fill_possible_map();
+       xen_init_spinlocks();
 }
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 332dd637..0e0c285 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -734,6 +734,33 @@ static void restore_cpu_ipis(unsigned int cpu)
        }
 }
 
+/* Clear an irq's pending state, in preparation for polling on it */
+void xen_clear_irq_pending(int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+
+       if (VALID_EVTCHN(evtchn))
+               clear_evtchn(evtchn);
+}
+
+/* Poll waiting for an irq to become pending.  In the usual case, the
+   irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq)
+{
+       evtchn_port_t evtchn = evtchn_from_irq(irq);
+
+       if (VALID_EVTCHN(evtchn)) {
+               struct sched_poll poll;
+
+               poll.nr_ports = 1;
+               poll.timeout = 0;
+               poll.ports = &evtchn;
+
+               if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
+                       BUG();
+       }
+}
+
 void xen_irq_resume(void)
 {
        unsigned int cpu, irq, evtchn;
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 695ce93..aec9767 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -325,6 +325,15 @@ struct pv_mmu_ops {
                           unsigned long phys, pgprot_t flags);
 };
 
+struct raw_spinlock;
+struct pv_lock_ops {
+       int (*spin_is_locked)(struct raw_spinlock *lock);
+       int (*spin_is_contended)(struct raw_spinlock *lock);
+       void (*spin_lock)(struct raw_spinlock *lock);
+       int (*spin_trylock)(struct raw_spinlock *lock);
+       void (*spin_unlock)(struct raw_spinlock *lock);
+};
+
 /* This contains all the paravirt structures: we get a convenient
  * number for each function using the offset which we use to indicate
  * what to patch. */
@@ -335,6 +344,7 @@ struct paravirt_patch_template {
        struct pv_irq_ops pv_irq_ops;
        struct pv_apic_ops pv_apic_ops;
        struct pv_mmu_ops pv_mmu_ops;
+       struct pv_lock_ops pv_lock_ops;
 };
 
 extern struct pv_info pv_info;
@@ -344,6 +354,7 @@ extern struct pv_cpu_ops pv_cpu_ops;
 extern struct pv_irq_ops pv_irq_ops;
 extern struct pv_apic_ops pv_apic_ops;
 extern struct pv_mmu_ops pv_mmu_ops;
+extern struct pv_lock_ops pv_lock_ops;
 
 #define PARAVIRT_PATCH(x)                                      \
        (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
@@ -1368,6 +1379,37 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 void _paravirt_nop(void);
 #define paravirt_nop   ((void *)_paravirt_nop)
 
+void paravirt_use_bytelocks(void);
+
+#ifdef CONFIG_SMP
+
+static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
+{
+       return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
+}
+
+static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
+{
+       return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
+}
+
+static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
+{
+       PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
+}
+
+static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock)
+{
+       return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
+}
+
+static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
+{
+       PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
+}
+
+#endif
+
 /* These all sit in the .parainstructions section to tell us what to patch. */
 struct paravirt_patch_site {
        u8 *instr;              /* original instructions */
@@ -1452,6 +1494,7 @@ static inline unsigned long __raw_local_irq_save(void)
        return f;
 }
 
+
 /* Make sure as little as possible of this mess escapes. */
 #undef PARAVIRT_CALL
 #undef __PVOP_CALL
diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h
index 21e89bf..4f9a986 100644
--- a/include/asm-x86/spinlock.h
+++ b/include/asm-x86/spinlock.h
@@ -6,7 +6,7 @@
 #include <asm/page.h>
 #include <asm/processor.h>
 #include <linux/compiler.h>
-
+#include <asm/paravirt.h>
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
  *
@@ -54,21 +54,21 @@
  * much between them in performance though, especially as locks are out of line.
  */
 #if (NR_CPUS < 256)
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
 {
        int tmp = ACCESS_ONCE(lock->slock);
 
        return (((tmp >> 8) & 0xff) != (tmp & 0xff));
 }
 
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
 {
        int tmp = ACCESS_ONCE(lock->slock);
 
        return (((tmp >> 8) & 0xff) - (tmp & 0xff)) > 1;
 }
 
-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
 {
        short inc = 0x0100;
 
@@ -87,9 +87,7 @@ static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
                : "memory", "cc");
 }
 
-#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
-
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
 {
        int tmp;
        short new;
@@ -110,7 +108,7 @@ static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
        return tmp;
 }
 
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
 {
        asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
                     : "+m" (lock->slock)
@@ -118,21 +116,21 @@ static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
                     : "memory", "cc");
 }
 #else
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
 {
        int tmp = ACCESS_ONCE(lock->slock);
 
        return (((tmp >> 16) & 0xffff) != (tmp & 0xffff));
 }
 
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
 {
        int tmp = ACCESS_ONCE(lock->slock);
 
        return (((tmp >> 16) & 0xffff) - (tmp & 0xffff)) > 1;
 }
 
-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
 {
        int inc = 0x00010000;
        int tmp;
@@ -153,9 +151,7 @@ static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
                     : "memory", "cc");
 }
 
-#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
-
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
 {
        int tmp;
        int new;
@@ -177,7 +173,7 @@ static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
        return tmp;
 }
 
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
 {
        asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
                     : "+m" (lock->slock)
@@ -186,6 +182,98 @@ static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
 }
 #endif
 
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+
+#ifdef CONFIG_PARAVIRT
+/*
+ * Define virtualization-friendly old-style lock byte lock, for use in
+ * pv_lock_ops if desired.
+ *
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure.  It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+struct __byte_spinlock {
+       s8 lock;
+       s8 spinners;
+};
+
+static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
+{
+       struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+       return bl->lock != 0;
+}
+
+static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
+{
+       struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+       return bl->spinners != 0;
+}
+
+static inline void __byte_spin_lock(raw_spinlock_t *lock)
+{
+       struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+       s8 val = 1;
+
+       asm("1: xchgb %1, %0\n"
+           "   test %1,%1\n"
+           "   jz 3f\n"
+           "   " LOCK_PREFIX "incb %2\n"
+           "2: rep;nop\n"
+           "   cmpb $1, %0\n"
+           "   je 2b\n"
+           "   " LOCK_PREFIX "decb %2\n"
+           "   jmp 1b\n"
+           "3:"
+           : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
+}
+
+static inline int __byte_spin_trylock(raw_spinlock_t *lock)
+{
+       struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+       u8 old = 1;
+
+       asm("xchgb %1,%0"
+           : "+m" (bl->lock), "+q" (old) : : "memory");
+
+       return old == 0;
+}
+
+static inline void __byte_spin_unlock(raw_spinlock_t *lock)
+{
+       struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+       smp_wmb();
+       bl->lock = 0;
+}
+#else  /* !CONFIG_PARAVIRT */
+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+{
+       return __ticket_spin_is_locked(lock);
+}
+
+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+{
+       return __ticket_spin_is_contended(lock);
+}
+
+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+       __ticket_spin_lock(lock);
+}
+
+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+       return __ticket_spin_trylock(lock);
+}
+
+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+       __ticket_spin_unlock(lock);
+}
+#endif /* CONFIG_PARAVIRT */
+
 static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
 {
        while (__raw_spin_is_locked(lock))
diff --git a/include/asm-x86/spinlock_types.h b/include/asm-x86/spinlock_types.h
index 9029cf7..06c071c 100644
--- a/include/asm-x86/spinlock_types.h
+++ b/include/asm-x86/spinlock_types.h
@@ -5,7 +5,7 @@
 # error "please don't include this file directly"
 #endif
 
-typedef struct {
+typedef struct raw_spinlock {
        unsigned int slock;
 } raw_spinlock_t;
 
diff --git a/include/asm-x86/xen/events.h b/include/asm-x86/xen/events.h
index f8d57ea..8ded747 100644
--- a/include/asm-x86/xen/events.h
+++ b/include/asm-x86/xen/events.h
@@ -5,6 +5,7 @@ enum ipi_vector {
        XEN_RESCHEDULE_VECTOR,
        XEN_CALL_FUNCTION_VECTOR,
        XEN_CALL_FUNCTION_SINGLE_VECTOR,
+       XEN_SPIN_UNLOCK_VECTOR,
 
        XEN_NR_IPIS,
 };
diff --git a/include/xen/events.h b/include/xen/events.h
index 67c4436..4680ff3 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -44,4 +44,11 @@ extern void notify_remote_via_irq(int irq);
 
 extern void xen_irq_resume(void);
 
+/* Clear an irq's pending state, in preparation for polling on it */
+void xen_clear_irq_pending(int irq);
+
+/* Poll waiting for an irq to become pending.  In the usual case, the
+   irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq);
+
 #endif /* _XEN_EVENTS_H */
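
Reader's note (not part of the patch): xen_spin_lock() spins on the lock byte for a bounded number of iterations and then drops into xen_spin_lock_slow(), which records itself in lock_spinners and blocks on a per-cpu event channel until xen_spin_unlock_slow() kicks it. The sketch below models that spin-then-block protocol in ordinary pthreads code; a mutex/condvar pair stands in for the Xen event channel, all names (demo_lock, demo_spin_lock, demo_spin_unlock, ...) are invented, error handling is omitted, and it is only an approximation of the patch's behaviour, not a reimplementation of it.

/*
 * Sketch of "spin briefly, announce, then block until kicked", assuming a
 * POSIX threads environment instead of a guest kernel.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define SPIN_LIMIT (1 << 10)   /* bounded fast path, like "timeout = 1 << 10" */

struct demo_lock {
        atomic_flag locked;     /* 0 -> free, 1 -> held (cf. xl->lock) */
        atomic_int spinners;    /* waiters in the slow path (cf. xl->spinners) */
        pthread_mutex_t mtx;    /* stands in for the per-cpu kicker irq */
        pthread_cond_t kick;
};

static bool demo_trylock(struct demo_lock *l)
{
        return !atomic_flag_test_and_set_explicit(&l->locked, memory_order_acquire);
}

static void demo_lock_slow(struct demo_lock *l)
{
        atomic_fetch_add(&l->spinners, 1);      /* announce we're waiting */
        pthread_mutex_lock(&l->mtx);
        while (!demo_trylock(l))                /* re-check, then block until kicked */
                pthread_cond_wait(&l->kick, &l->mtx);
        pthread_mutex_unlock(&l->mtx);
        atomic_fetch_sub(&l->spinners, 1);
}

static void demo_spin_lock(struct demo_lock *l)
{
        for (int i = 0; i < SPIN_LIMIT; i++)    /* fast path: just spin a while */
                if (demo_trylock(l))
                        return;
        demo_lock_slow(l);                      /* contended: go to sleep */
}

static void demo_spin_unlock(struct demo_lock *l)
{
        atomic_flag_clear_explicit(&l->locked, memory_order_release);
        if (atomic_load(&l->spinners)) {        /* somebody is blocked: kick one */
                pthread_mutex_lock(&l->mtx);
                pthread_cond_signal(&l->kick);
                pthread_mutex_unlock(&l->mtx);
        }
}

static struct demo_lock lock = {
        .locked = ATOMIC_FLAG_INIT,
        .mtx = PTHREAD_MUTEX_INITIALIZER,
        .kick = PTHREAD_COND_INITIALIZER,
};

static void *worker(void *arg)
{
        for (int i = 0; i < 100000; i++) {
                demo_spin_lock(&lock);
                demo_spin_unlock(&lock);
        }
        return arg;
}

int main(void)
{
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        puts("done");
        return 0;
}

Build with something like gcc -std=c11 -pthread; the point is the shape of the protocol -- bounded spinning, announce-then-block, and a targeted wakeup on unlock -- rather than the kernel's exact event-channel mechanics.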