From b3ac891b67bd4b1fc728d1c784cad1212dea433d Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Wed, 24 Feb 2010 10:54:22 +0100
Subject: x86: Add support for lock prefix in alternatives

The current lock prefix UP/SMP alternative code doesn't allow
LOCK_PREFIX to be used in alternatives code.

This patch solves the problem by adding a new LOCK_PREFIX_ALTERNATIVE_PATCH
macro that only records the lock prefix location but does not emit
the prefix.

The user of this macro can then start any alternative sequence with
"lock" and have it UP/SMP patched.

To make this work, the UP/SMP alternative code is changed to do the
lock/DS prefix switching only if the byte actually contains a lock or
DS prefix.

Thus, if an alternative without the "lock" is selected, it will now do
nothing instead of clobbering the code.

Changes in v2:
- Naming change
- Change label to not conflict with alternatives

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-2-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/alternative.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2589ea4..80b222e 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -244,7 +244,8 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 		if (*ptr > text_end)
 			continue;
 		/* turn DS segment override prefix into lock prefix */
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+		if (**ptr == 0x3e)
+			text_poke(*ptr, ((unsigned char []){0xf0}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
@@ -263,7 +264,8 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
 		if (*ptr > text_end)
 			continue;
 		/* turn lock prefix into DS segment override prefix */
-		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+		if (**ptr == 0xf0)
+			text_poke(*ptr, ((unsigned char []){0x3E}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
-- 
cgit v1.1


From ced918eb748ce30b3aace549fd17540e40ffdca0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 17 Feb 2010 16:47:10 +0000
Subject: i8253: Convert i8253_lock to raw_spinlock

i8253_lock needs to be a real spinlock in preempt-rt, i.e. it can
not be converted to a sleeping lock.

Convert it to raw_spinlock and fix up all users.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Takashi Iwai <tiwai@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <20100217163751.030764372@linutronix.de>
---
 arch/x86/kernel/apm_32.c |  4 ++--
 arch/x86/kernel/i8253.c  | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 031aa88..c4f9182 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1224,7 +1224,7 @@ static void reinit_timer(void)
 #ifdef INIT_TIMER_AFTER_SUSPEND
 	unsigned long flags;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/* set the clock to HZ */
 	outb_pit(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
 	udelay(10);
@@ -1232,7 +1232,7 @@ static void reinit_timer(void)
 	udelay(10);
 	outb_pit(LATCH >> 8, PIT_CH0);	/* MSB */
 	udelay(10);
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 #endif
 }
 
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 23c1679..2dfd315 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -16,7 +16,7 @@
 #include <asm/hpet.h>
 #include <asm/smp.h>
 
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
 /*
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event;
 static void init_pit_timer(enum clock_event_mode mode,
 			   struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 
 	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode,
 		/* Nothing to do here */
 		break;
 	}
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 }
 
 /*
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode,
  */
 static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	outb_pit(delta & 0xff , PIT_CH0);	/* LSB */
 	outb_pit(delta >> 8 , PIT_CH0);		/* MSB */
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 
 	return 0;
 }
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	int count;
 	u32 jifs;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/*
 	 * Although our caller may have the read side of xtime_lock,
 	 * this is now a seqlock, and we are cheating in this routine
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	old_count = count;
 	old_jifs = jifs;
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	count = (LATCH - 1) - count;
 
-- 
cgit v1.1


From 4abc14a733f9002c05623db755aaafdd27fa7a91 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 Jan 2010 14:52:23 +0100
Subject: iommu-api: Rename ->{un}map function pointers to ->{un}map_range

The new function pointer names match better with the
top-level functions of the iommu-api which are using them.
Main intention of this change is to make the ->{un}map
pointer names free for two new mapping functions.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index adb0ba0..59cae7c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2515,8 +2515,8 @@ static struct iommu_ops amd_iommu_ops = {
 	.domain_destroy = amd_iommu_domain_destroy,
 	.attach_dev = amd_iommu_attach_device,
 	.detach_dev = amd_iommu_detach_device,
-	.map = amd_iommu_map_range,
-	.unmap = amd_iommu_unmap_range,
+	.map_range = amd_iommu_map_range,
+	.unmap_range = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
 	.domain_has_cap = amd_iommu_domain_has_cap,
 };
-- 
cgit v1.1


From cbb9d729f3433c9c2660b01dc52e6deb89488886 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 15 Jan 2010 14:41:15 +0100
Subject: x86/amd-iommu: Make iommu_map_page and alloc_pte aware of page sizes

This patch changes the old map_size parameter of alloc_pte
to a page_size parameter which can be used more easily to
alloc a pte for intermediate page sizes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 53 ++++++++++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 20 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 59cae7c..4170031 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -730,18 +730,22 @@ static bool increase_address_space(struct protection_domain *domain,
 
 static u64 *alloc_pte(struct protection_domain *domain,
 		      unsigned long address,
-		      int end_lvl,
+		      unsigned long page_size,
 		      u64 **pte_page,
 		      gfp_t gfp)
 {
+	int level, end_lvl;
 	u64 *pte, *page;
-	int level;
+
+	BUG_ON(!is_power_of_2(page_size));
 
 	while (address > PM_LEVEL_SIZE(domain->mode))
 		increase_address_space(domain, gfp);
 
-	level =  domain->mode - 1;
-	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	level   = domain->mode - 1;
+	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	address = PAGE_SIZE_ALIGN(address, page_size);
+	end_lvl = PAGE_SIZE_LEVEL(page_size);
 
 	while (level > end_lvl) {
 		if (!IOMMU_PTE_PRESENT(*pte)) {
@@ -751,6 +755,10 @@ static u64 *alloc_pte(struct protection_domain *domain,
 			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
 		}
 
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
 		level -= 1;
 
 		pte = IOMMU_PTE_PAGE(*pte);
@@ -806,31 +814,36 @@ static int iommu_map_page(struct protection_domain *dom,
 			  unsigned long bus_addr,
 			  unsigned long phys_addr,
 			  int prot,
-			  int map_size)
+			  unsigned long page_size)
 {
 	u64 __pte, *pte;
-
-	bus_addr  = PAGE_ALIGN(bus_addr);
-	phys_addr = PAGE_ALIGN(phys_addr);
-
-	BUG_ON(!PM_ALIGNED(map_size, bus_addr));
-	BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+	int i, count;
 
 	if (!(prot & IOMMU_PROT_MASK))
 		return -EINVAL;
 
-	pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
+	bus_addr  = PAGE_ALIGN(bus_addr);
+	phys_addr = PAGE_ALIGN(phys_addr);
+	count     = PAGE_SIZE_PTE_COUNT(page_size);
+	pte       = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
+
+	for (i = 0; i < count; ++i)
+		if (IOMMU_PTE_PRESENT(pte[i]))
+			return -EBUSY;
 
-	if (IOMMU_PTE_PRESENT(*pte))
-		return -EBUSY;
+	if (page_size > PAGE_SIZE) {
+		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
+		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
+	} else
+		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
 
-	__pte = phys_addr | IOMMU_PTE_P;
 	if (prot & IOMMU_PROT_IR)
 		__pte |= IOMMU_PTE_IR;
 	if (prot & IOMMU_PROT_IW)
 		__pte |= IOMMU_PTE_IW;
 
-	*pte = __pte;
+	for (i = 0; i < count; ++i)
+		pte[i] = __pte;
 
 	update_domain(dom);
 
@@ -877,7 +890,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 	for (addr = e->address_start; addr < e->address_end;
 	     addr += PAGE_SIZE) {
 		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
-				     PM_MAP_4k);
+				     PAGE_SIZE);
 		if (ret)
 			return ret;
 		/*
@@ -1005,7 +1018,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 		u64 *pte, *pte_page;
 
 		for (i = 0; i < num_ptes; ++i) {
-			pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
+			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
 					&pte_page, gfp);
 			if (!pte)
 				goto out_free;
@@ -1711,7 +1724,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
 
 	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
 	if (!pte) {
-		pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+		pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
 				GFP_ATOMIC);
 		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
 	} else
@@ -2457,7 +2470,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
 	paddr &= PAGE_MASK;
 
 	for (i = 0; i < npages; ++i) {
-		ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
+		ret = iommu_map_page(domain, iova, paddr, prot, PAGE_SIZE);
 		if (ret)
 			return ret;
 
-- 
cgit v1.1


From 24cd772315c19e4d9409d0d21367ec1ebab3149f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 19 Jan 2010 17:27:39 +0100
Subject: x86/amd-iommu: Make iommu_unmap_page and fetch_pte aware of page
 sizes

This patch extends the functionality of iommu_unmap_page
and fetch_pte to support arbitrary page sizes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 90 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 72 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 4170031..503d312 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -776,28 +776,47 @@ static u64 *alloc_pte(struct protection_domain *domain,
  * This function checks if there is a PTE for a given dma address. If
  * there is one, it returns the pointer to it.
  */
-static u64 *fetch_pte(struct protection_domain *domain,
-		      unsigned long address, int map_size)
+static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
 {
 	int level;
 	u64 *pte;
 
-	level =  domain->mode - 1;
-	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	if (address > PM_LEVEL_SIZE(domain->mode))
+		return NULL;
+
+	level   =  domain->mode - 1;
+	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
 
-	while (level > map_size) {
+	while (level > 0) {
+
+		/* Not Present */
 		if (!IOMMU_PTE_PRESENT(*pte))
 			return NULL;
 
+		/* Large PTE */
+		if (PM_PTE_LEVEL(*pte) == 0x07) {
+			unsigned long pte_mask, __pte;
+
+			/*
+			 * If we have a series of large PTEs, make
+			 * sure to return a pointer to the first one.
+			 */
+			pte_mask = PTE_PAGE_SIZE(*pte);
+			pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
+			__pte    = ((unsigned long)pte) & pte_mask;
+
+			return (u64 *)__pte;
+		}
+
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
 		level -= 1;
 
+		/* Walk to the next level */
 		pte = IOMMU_PTE_PAGE(*pte);
 		pte = &pte[PM_LEVEL_INDEX(level, address)];
-
-		if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
-			pte = NULL;
-			break;
-		}
 	}
 
 	return pte;
@@ -850,13 +869,48 @@ static int iommu_map_page(struct protection_domain *dom,
 	return 0;
 }
 
-static void iommu_unmap_page(struct protection_domain *dom,
-			     unsigned long bus_addr, int map_size)
+static unsigned long iommu_unmap_page(struct protection_domain *dom,
+				      unsigned long bus_addr,
+				      unsigned long page_size)
 {
-	u64 *pte = fetch_pte(dom, bus_addr, map_size);
+	unsigned long long unmap_size, unmapped;
+	u64 *pte;
+
+	BUG_ON(!is_power_of_2(page_size));
+
+	unmapped = 0;
+
+	while (unmapped < page_size) {
+
+		pte = fetch_pte(dom, bus_addr);
+
+		if (!pte) {
+			/*
+			 * No PTE for this address
+			 * move forward in 4kb steps
+			 */
+			unmap_size = PAGE_SIZE;
+		} else if (PM_PTE_LEVEL(*pte) == 0) {
+			/* 4kb PTE found for this address */
+			unmap_size = PAGE_SIZE;
+			*pte       = 0ULL;
+		} else {
+			int count, i;
+
+			/* Large PTE found which maps this address */
+			unmap_size = PTE_PAGE_SIZE(*pte);
+			count      = PAGE_SIZE_PTE_COUNT(unmap_size);
+			for (i = 0; i < count; i++)
+				pte[i] = 0ULL;
+		}
+
+		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
+		unmapped += unmap_size;
+	}
+
+	BUG_ON(!is_power_of_2(unmapped));
 
-	if (pte)
-		*pte = 0;
+	return unmapped;
 }
 
 /*
@@ -1054,7 +1108,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 	for (i = dma_dom->aperture[index]->offset;
 	     i < dma_dom->aperture_size;
 	     i += PAGE_SIZE) {
-		u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
+		u64 *pte = fetch_pte(&dma_dom->domain, i);
 		if (!pte || !IOMMU_PTE_PRESENT(*pte))
 			continue;
 
@@ -2491,7 +2545,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
 	iova  &= PAGE_MASK;
 
 	for (i = 0; i < npages; ++i) {
-		iommu_unmap_page(domain, iova, PM_MAP_4k);
+		iommu_unmap_page(domain, iova, PAGE_SIZE);
 		iova  += PAGE_SIZE;
 	}
 
@@ -2506,7 +2560,7 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 	phys_addr_t paddr;
 	u64 *pte;
 
-	pte = fetch_pte(domain, iova, PM_MAP_4k);
+	pte = fetch_pte(domain, iova);
 
 	if (!pte || !IOMMU_PTE_PRESENT(*pte))
 		return 0;
-- 
cgit v1.1


From f03152bb7d0a74f409ad63ed36916444a7493d72 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Jan 2010 16:15:24 +0100
Subject: x86/amd-iommu: Make amd_iommu_iova_to_phys aware of multiple page
 sizes

This patch extends the amd_iommu_iova_to_phys() function to
handle different page sizes correctly. It doesn't use
fetch_pte() anymore because we don't know (or care about)
the page_size used for mapping the given iova.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 503d312..52e44af 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2556,17 +2556,22 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 					  unsigned long iova)
 {
 	struct protection_domain *domain = dom->priv;
-	unsigned long offset = iova & ~PAGE_MASK;
+	unsigned long offset_mask;
 	phys_addr_t paddr;
-	u64 *pte;
+	u64 *pte, __pte;
 
 	pte = fetch_pte(domain, iova);
 
 	if (!pte || !IOMMU_PTE_PRESENT(*pte))
 		return 0;
 
-	paddr  = *pte & IOMMU_PAGE_MASK;
-	paddr |= offset;
+	if (PM_PTE_LEVEL(*pte) == 0)
+		offset_mask = PAGE_SIZE - 1;
+	else
+		offset_mask = PTE_PAGE_SIZE(*pte) - 1;
+
+	__pte = *pte & PM_ADDR_MASK;
+	paddr = (__pte & ~offset_mask) | (iova & offset_mask);
 
 	return paddr;
 }
-- 
cgit v1.1


From 468e2366cdb80cf8a691b8bc212260cfbdbd518e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Jan 2010 16:37:36 +0100
Subject: x86/amd-iommu: Implement ->{un}map callbacks for iommu-api

This patch implements the new callbacks for the IOMMU-API
with functions that can handle different page sizes in the
IOMMU page table.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 52e44af..0e068c9 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2552,6 +2552,33 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
 	iommu_flush_tlb_pde(domain);
 }
 
+static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
+			 phys_addr_t paddr, int gfp_order, int iommu_prot)
+{
+	unsigned long page_size = 0x1000UL << gfp_order;
+	struct protection_domain *domain = dom->priv;
+	int prot = 0;
+
+	if (iommu_prot & IOMMU_READ)
+		prot |= IOMMU_PROT_IR;
+	if (iommu_prot & IOMMU_WRITE)
+		prot |= IOMMU_PROT_IW;
+
+	return iommu_map_page(domain, iova, paddr, prot, page_size);
+}
+
+static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+			   int gfp_order)
+{
+	struct protection_domain *domain = dom->priv;
+	unsigned long page_size, unmap_size;
+
+	page_size  = 0x1000UL << gfp_order;
+	unmap_size = iommu_unmap_page(domain, iova, page_size);
+
+	return get_order(unmap_size);
+}
+
 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 					  unsigned long iova)
 {
@@ -2587,6 +2614,8 @@ static struct iommu_ops amd_iommu_ops = {
 	.domain_destroy = amd_iommu_domain_destroy,
 	.attach_dev = amd_iommu_attach_device,
 	.detach_dev = amd_iommu_detach_device,
+	.map = amd_iommu_map,
+	.unmap = amd_iommu_unmap,
 	.map_range = amd_iommu_map_range,
 	.unmap_range = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
-- 
cgit v1.1


From 12c7389abe5786349d3ea6da1961cf78d0c1c7cd Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Jan 2010 11:50:28 +0100
Subject: iommu-api: Remove iommu_{un}map_range functions

These functions are not longer used and can be removed
savely. There functionality is now provided by the
iommu_{un}map functions which are also capable of multiple
page sizes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 48 ---------------------------------------------
 1 file changed, 48 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0e068c9..d8da998 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2506,52 +2506,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 	return ret;
 }
 
-static int amd_iommu_map_range(struct iommu_domain *dom,
-			       unsigned long iova, phys_addr_t paddr,
-			       size_t size, int iommu_prot)
-{
-	struct protection_domain *domain = dom->priv;
-	unsigned long i,  npages = iommu_num_pages(paddr, size, PAGE_SIZE);
-	int prot = 0;
-	int ret;
-
-	if (iommu_prot & IOMMU_READ)
-		prot |= IOMMU_PROT_IR;
-	if (iommu_prot & IOMMU_WRITE)
-		prot |= IOMMU_PROT_IW;
-
-	iova  &= PAGE_MASK;
-	paddr &= PAGE_MASK;
-
-	for (i = 0; i < npages; ++i) {
-		ret = iommu_map_page(domain, iova, paddr, prot, PAGE_SIZE);
-		if (ret)
-			return ret;
-
-		iova  += PAGE_SIZE;
-		paddr += PAGE_SIZE;
-	}
-
-	return 0;
-}
-
-static void amd_iommu_unmap_range(struct iommu_domain *dom,
-				  unsigned long iova, size_t size)
-{
-
-	struct protection_domain *domain = dom->priv;
-	unsigned long i,  npages = iommu_num_pages(iova, size, PAGE_SIZE);
-
-	iova  &= PAGE_MASK;
-
-	for (i = 0; i < npages; ++i) {
-		iommu_unmap_page(domain, iova, PAGE_SIZE);
-		iova  += PAGE_SIZE;
-	}
-
-	iommu_flush_tlb_pde(domain);
-}
-
 static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
 			 phys_addr_t paddr, int gfp_order, int iommu_prot)
 {
@@ -2616,8 +2570,6 @@ static struct iommu_ops amd_iommu_ops = {
 	.detach_dev = amd_iommu_detach_device,
 	.map = amd_iommu_map,
 	.unmap = amd_iommu_unmap,
-	.map_range = amd_iommu_map_range,
-	.unmap_range = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
 	.domain_has_cap = amd_iommu_domain_has_cap,
 };
-- 
cgit v1.1


From 938179b4f8cf8a4f11234ebf2dff2eb48400acfe Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Fri, 5 Mar 2010 11:42:03 -0600
Subject: x86: Improve Intel microcode loader performance

We've noticed that on large SGI UV system configurations,
running microcode.ctl can take very long periods of time.  This
is due to the large number of vmalloc/vfree calls made by the
Intel generic_load_microcode() logic.

By reusing allocated space, the following patch reduces the time
to run microcode.ctl on a 1024 cpu system from approximately 80
seconds down to 1 or 2 seconds.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Acked-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Bill Davidsen <davidsen@tmr.com>
LKML-Reference: <20100305174203.GA19638@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_intel.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 85a343e..3561702 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 				int (*get_ucode_data)(void *, const void *, size_t))
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+	u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
 	int new_rev = uci->cpu_sig.rev;
 	unsigned int leftover = size;
 	enum ucode_state state = UCODE_OK;
+	unsigned int curr_mc_size = 0;
 
 	while (leftover) {
 		struct microcode_header_intel mc_header;
@@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 			break;
 		}
 
-		mc = vmalloc(mc_size);
-		if (!mc)
-			break;
+		/* For performance reasons, reuse mc area when possible */
+		if (!mc || mc_size > curr_mc_size) {
+			if (mc)
+				vfree(mc);
+			mc = vmalloc(mc_size);
+			if (!mc)
+				break;
+			curr_mc_size = mc_size;
+		}
 
 		if (get_ucode_data(mc, ucode_ptr, mc_size) ||
 		    microcode_sanity_check(mc) < 0) {
@@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 				vfree(new_mc);
 			new_rev = mc_header.rev;
 			new_mc  = mc;
-		} else
-			vfree(mc);
+			mc = NULL;	/* trigger new vmalloc */
+		}
 
 		ucode_ptr += mc_size;
 		leftover  -= mc_size;
 	}
 
+	if (mc)
+		vfree(mc);
+
 	if (leftover) {
 		if (new_mc)
 			vfree(new_mc);
-- 
cgit v1.1


From 6f4edd69e40aba4f45bf9558c1e9a950d79ab4e4 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 10 Mar 2010 14:44:58 -0600
Subject: x86, UV: Clean up UV headers for MMR definitions

Update UV mmr definitions header file. Eliminate definitions no
longer needed. Move 2 definitions from tlb_uv.c into the header
file where they belong.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100310204458.GA28835@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 364d015ef..ef68ba4 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -20,6 +20,8 @@
 #include <asm/tsc.h>
 #include <asm/irq_vectors.h>
 
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	0x000000000bUL
+
 static struct bau_control	**uv_bau_table_bases __read_mostly;
 static int			uv_bau_retry_limit __read_mostly;
 
@@ -478,16 +480,16 @@ static void uv_enable_timeouts(void)
 		 * To program the period, the SOFT_ACK_MODE must be off.
 		 */
 		mmr_image &= ~((unsigned long)1 <<
-			       UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 		/*
 		 * Set the 4-bit period.
 		 */
 		mmr_image &= ~((unsigned long)0xf <<
-			UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
 		mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
-			     UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 		/*
@@ -496,7 +498,7 @@ static void uv_enable_timeouts(void)
 		 * indicated in bits 2:0 (7 causes all of them to timeout).
 		 */
 		mmr_image |= ((unsigned long)1 <<
-			      UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 	}
-- 
cgit v1.1


From 2acebe9ecb2b77876e87a1480729cfb2db4570dd Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 17 Mar 2010 10:40:38 -0500
Subject: x86, UV: Delete unneeded boot messages

SGI:UV: Delete extra boot messages that describe the system
topology. These messages are no longer useful.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100317154038.GA29346@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 3740c8a..9a93b07 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -736,9 +736,6 @@ void __init uv_system_init(void)
 		uv_node_to_blade[nid] = blade;
 		uv_cpu_to_blade[cpu] = blade;
 		max_pnode = max(pnode, max_pnode);
-
-		printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
-			cpu, apicid, pnode, nid, lcpu, blade);
 	}
 
 	/* Add blade/pnode info for nodes without cpus */
-- 
cgit v1.1


From 4bd96a7a8185755b091233b16034c7436cbf57af Mon Sep 17 00:00:00 2001
From: Shane Wang <shane.wang@intel.com>
Date: Wed, 10 Mar 2010 14:36:10 +0800
Subject: x86, tboot: Add support for S3 memory integrity protection

This patch adds support for S3 memory integrity protection within an Intel(R)
TXT launched kernel, for all kernel and userspace memory.  All RAM used by the
kernel and userspace, as indicated by memory ranges of type E820_RAM and
E820_RESERVED_KERN in the e820 table, will be integrity protected.

The MAINTAINERS file is also updated to reflect the maintainers of the
TXT-related code.

All MACing is done in tboot, based on a complexity analysis and tradeoff.

v3: Compared with v2, this patch adds a check of array size in
tboot.c, and a note to specify which c/s of tboot supports this kind
of MACing in intel_txt.txt.

Signed-off-by: Shane Wang <shane.wang@intel.com>
LKML-Reference: <4B973DDA.6050902@intel.com>
Signed-off-by: Joseph Cihula <joseph.cihula@intel.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/tboot.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91..cc2c604 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -175,6 +175,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
 	struct tboot_mac_region *mr;
 	phys_addr_t end = start + size;
 
+	if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
+		panic("tboot: Too many MAC regions\n");
+
 	if (start && size) {
 		mr = &tboot->mac_regions[tboot->num_mac_regions++];
 		mr->start = round_down(start, PAGE_SIZE);
@@ -184,18 +187,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
 
 static int tboot_setup_sleep(void)
 {
+	int i;
+
 	tboot->num_mac_regions = 0;
 
-	/* S3 resume code */
-	add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
+	for (i = 0; i < e820.nr_map; i++) {
+		if ((e820.map[i].type != E820_RAM)
+		 && (e820.map[i].type != E820_RESERVED_KERN))
+			continue;
 
-#ifdef CONFIG_X86_TRAMPOLINE
-	/* AP trampoline code */
-	add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
-#endif
-
-	/* kernel code + data + bss */
-	add_mac_region(virt_to_phys(_text), _end - _text);
+		add_mac_region(e820.map[i].addr, e820.map[i].size);
+	}
 
 	tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
 
-- 
cgit v1.1


From 5958f1d5d722df7a9e5d129676614a8e5219bacd Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 31 Mar 2010 21:56:41 +0200
Subject: x86, cpu: Add AMD core boosting feature flag to /proc/cpuinfo

By semi-popular demand, this adds the Core Performance Boost feature
flag to /proc/cpuinfo. Possible use case for this is userspace tools
like cpufreq-aperf, for example, so that they don't have to jump through
hoops of accessing "/dev/cpu/%d/cpuid" in order to check for CPB hw
support, or call cpuid from userspace.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-2-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 97ad79c..ead2a1c 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,8 +30,9 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	const struct cpuid_bit *cb;
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
-		{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_IDA,   CR_EAX, 1, 0x00000006 },
+		{ X86_FEATURE_ARAT,  CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_CPB,   CR_EDX, 9, 0x80000007 },
 		{ X86_FEATURE_NPT,   CR_EDX, 0, 0x8000000a },
 		{ X86_FEATURE_LBRV,  CR_EDX, 1, 0x8000000a },
 		{ X86_FEATURE_SVML,  CR_EDX, 2, 0x8000000a },
-- 
cgit v1.1


From 73860c6b2fd159a35637e233d735e36887c266ad Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 31 Mar 2010 21:56:42 +0200
Subject: powernow-k8: Add core performance boost support

Starting with F10h, revE, AMD processors add support for a dynamic
core boosting feature called Core Performance Boost. When a specific
condition is present, a subset of the cores on a system are boosted
beyond their P0 operating frequency to speed up the performance of
single-threaded applications.

In the normal case, the system comes out of reset with core boosting
enabled. This patch adds a sysfs knob with which core boosting can be
switched on or off for benchmarking purposes.

While at it, make the CPB code hotplug-aware so that taking cores
offline wouldn't interfere with boosting the remaining online cores.
Furthermore, add cpu_online_mask hotplug protection as suggested by
Andrew.

Finally, cleanup the driver init codepath and update copyrights.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-3-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 161 ++++++++++++++++++++++++++++--
 arch/x86/kernel/cpu/cpufreq/powernow-k8.h |   2 -
 2 files changed, 151 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index d360b56..74ca34b 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
-
 /*
- *   (c) 2003-2006 Advanced Micro Devices, Inc.
+ *   (c) 2003-2010 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
@@ -54,6 +53,10 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
 
 static int cpu_family = CPU_OPTERON;
 
+/* core performance boost */
+static bool cpb_capable, cpb_enabled;
+static struct msr __percpu *msrs;
+
 #ifndef CONFIG_SMP
 static inline const struct cpumask *cpu_core_mask(int cpu)
 {
@@ -1393,8 +1396,77 @@ out:
 	return khz;
 }
 
+static void _cpb_toggle_msrs(bool t)
+{
+	int cpu;
+
+	get_online_cpus();
+
+	rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+	for_each_cpu(cpu, cpu_online_mask) {
+		struct msr *reg = per_cpu_ptr(msrs, cpu);
+		if (t)
+			reg->l &= ~BIT(25);
+		else
+			reg->l |= BIT(25);
+	}
+	wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+	put_online_cpus();
+}
+
+/*
+ * Switch on/off core performance boosting.
+ *
+ * 0=disable
+ * 1=enable.
+ */
+static void cpb_toggle(bool t)
+{
+	if (!cpb_capable)
+		return;
+
+	if (t && !cpb_enabled) {
+		cpb_enabled = true;
+		_cpb_toggle_msrs(t);
+		printk(KERN_INFO PFX "Core Boosting enabled.\n");
+	} else if (!t && cpb_enabled) {
+		cpb_enabled = false;
+		_cpb_toggle_msrs(t);
+		printk(KERN_INFO PFX "Core Boosting disabled.\n");
+	}
+}
+
+static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
+				 size_t count)
+{
+	int ret = -EINVAL;
+	unsigned long val = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (!ret && (val == 0 || val == 1) && cpb_capable)
+		cpb_toggle(val);
+	else
+		return -EINVAL;
+
+	return count;
+}
+
+static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
+{
+	return sprintf(buf, "%u\n", cpb_enabled);
+}
+
+#define define_one_rw(_name) \
+static struct freq_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+define_one_rw(cpb);
+
 static struct freq_attr *powernow_k8_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
+	&cpb,
 	NULL,
 };
 
@@ -1410,10 +1482,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
 	.attr		= powernow_k8_attr,
 };
 
+/*
+ * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
+ * cannot block the remaining ones from boosting. On the CPU_UP path we
+ * simply keep the boost-disable flag in sync with the current global
+ * state.
+ */
+static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
+				void *hcpu)
+{
+	unsigned cpu = (long)hcpu;
+	u32 lo, hi;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+
+		if (!cpb_enabled) {
+			rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+			lo |= BIT(25);
+			wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+		}
+		break;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+		lo &= ~BIT(25);
+		wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+		break;
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpb_nb = {
+	.notifier_call		= cpb_notify,
+};
+
 /* driver entry point for init */
 static int __cpuinit powernowk8_init(void)
 {
-	unsigned int i, supported_cpus = 0;
+	unsigned int i, supported_cpus = 0, cpu;
 
 	for_each_online_cpu(i) {
 		int rc;
@@ -1422,15 +1535,36 @@ static int __cpuinit powernowk8_init(void)
 			supported_cpus++;
 	}
 
-	if (supported_cpus == num_online_cpus()) {
-		printk(KERN_INFO PFX "Found %d %s "
-			"processors (%d cpu cores) (" VERSION ")\n",
-			num_online_nodes(),
-			boot_cpu_data.x86_model_id, supported_cpus);
-		return cpufreq_register_driver(&cpufreq_amd64_driver);
+	if (supported_cpus != num_online_cpus())
+		return -ENODEV;
+
+	printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
+		num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
+
+	if (boot_cpu_has(X86_FEATURE_CPB)) {
+
+		cpb_capable = true;
+
+		register_cpu_notifier(&cpb_nb);
+
+		msrs = msrs_alloc();
+		if (!msrs) {
+			printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
+			return -ENOMEM;
+		}
+
+		rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+		for_each_cpu(cpu, cpu_online_mask) {
+			struct msr *reg = per_cpu_ptr(msrs, cpu);
+			cpb_enabled |= !(!!(reg->l & BIT(25)));
+		}
+
+		printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
+			(cpb_enabled ? "on" : "off"));
 	}
 
-	return -ENODEV;
+	return cpufreq_register_driver(&cpufreq_amd64_driver);
 }
 
 /* driver entry point for term */
@@ -1438,6 +1572,13 @@ static void __exit powernowk8_exit(void)
 {
 	dprintk("exit\n");
 
+	if (boot_cpu_has(X86_FEATURE_CPB)) {
+		msrs_free(msrs);
+		msrs = NULL;
+
+		unregister_cpu_notifier(&cpb_nb);
+	}
+
 	cpufreq_unregister_driver(&cpufreq_amd64_driver);
 }
 
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824..df3529b 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
  *  http://www.gnu.org/licenses/gpl.html
  */
 
-
 enum pstate {
 	HW_PSTATE_INVALID = 0xff,
 	HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
 	struct cpumask *available_cores;
 };
 
-
 /* processor's cpuid instruction support */
 #define CPUID_PROCESSOR_SIGNATURE	1	/* function 1 */
 #define CPUID_XFAM			0x0ff00000	/* extended family */
-- 
cgit v1.1


From d65ad45cd82a0db9544469b8c54f5dc5cafbb2d8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 31 Mar 2010 21:56:43 +0200
Subject: x86: Unify APERF/MPERF support

Initialize this CPUID flag feature in common code. It could be made a
standalone function later, maybe, if more functionality is duplicated.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-4-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 8 ++++++++
 arch/x86/kernel/cpu/intel.c                | 6 ------
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index ead2a1c..fd1fc19 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -54,6 +54,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		if (regs[cb->reg] & (1 << cb->bit))
 			set_cpu_cap(c, cb->feature);
 	}
+
+	/*
+	 * common AMD/Intel features
+	 */
+	if (c->cpuid_level >= 6) {
+		if (cpuid_ecx(6) & 0x1)
+			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
+	}
 }
 
 /* leaf 0xb SMT level */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7e1cca1..3830258 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -352,12 +352,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
 	}
 
-	if (c->cpuid_level > 6) {
-		unsigned ecx = cpuid_ecx(6);
-		if (ecx & 0x01)
-			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
-	}
-
 	if (cpu_has_xmm2)
 		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
 	if (cpu_has_ds) {
-- 
cgit v1.1


From a2fed573f065e526bfd5cbf26e5491973d9e9aaa Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Thu, 18 Mar 2010 18:41:46 +0100
Subject: x86, cpufreq: Add APERF/MPERF support for AMD processors

Starting with model 10 of Family 0x10, AMD processors may have
support for APERF/MPERF.  Add support for identifying it and using
it within cpufreq.  Move the APERF/MPERF functions out of the
acpi-cpufreq code and into their own file so they can easily be
shared.

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <20100401141956.GA1930@aftab>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/Makefile       |  4 +--
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 44 ++------------------------
 arch/x86/kernel/cpu/cpufreq/mperf.c        | 51 ++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/cpufreq/mperf.h        |  9 ++++++
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c  |  8 +++++
 5 files changed, 72 insertions(+), 44 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.c
 create mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a..bd54bf6 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
 # K8 systems. ACPI is preferred to all other hardware-specific drivers.
 # speedstep-* is preferred over p4-clockmod.
 
-obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o
+obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o mperf.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o mperf.o
 obj-$(CONFIG_X86_PCC_CPUFREQ)		+= pcc-cpufreq.o
 obj-$(CONFIG_X86_POWERNOW_K6)		+= powernow-k6.o
 obj-$(CONFIG_X86_POWERNOW_K7)		+= powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1b1920f..dc68e5c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -45,6 +45,7 @@
 #include <asm/msr.h>
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
+#include "mperf.h"
 
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
 		"acpi-cpufreq", msg)
@@ -70,8 +71,6 @@ struct acpi_cpufreq_data {
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
 
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
-
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
 
@@ -239,45 +238,6 @@ static u32 get_cur_val(const struct cpumask *mask)
 	return cmd.val;
 }
 
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
-	struct aperfmperf *am = _cur;
-
-	get_aperfmperf(am);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-static unsigned int get_measured_perf(struct cpufreq_policy *policy,
-				      unsigned int cpu)
-{
-	struct aperfmperf perf;
-	unsigned long ratio;
-	unsigned int retval;
-
-	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
-		return 0;
-
-	ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
-	per_cpu(acfreq_old_perf, cpu) = perf;
-
-	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-
-	return retval;
-}
-
 static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 {
 	struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -701,7 +661,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	/* Check for APERF/MPERF support in hardware */
 	if (cpu_has(c, X86_FEATURE_APERFMPERF))
-		acpi_cpufreq_driver.getavg = get_measured_perf;
+		acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
 
 	dprintk("CPU%u - ACPI performance management activated.\n", cpu);
 	for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 0000000..911e193
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/slab.h>
+
+#include "mperf.h"
+
+static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
+
+/* Called via smp_call_function_single(), on the target CPU */
+static void read_measured_perf_ctrs(void *_cur)
+{
+	struct aperfmperf *am = _cur;
+
+	get_aperfmperf(am);
+}
+
+/*
+ * Return the measured active (C0) frequency on this CPU since last call
+ * to this function.
+ * Input: cpu number
+ * Return: Average CPU frequency in terms of max frequency (zero on error)
+ *
+ * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
+ * over a period of time, while CPU is in C0 state.
+ * IA32_MPERF counts at the rate of max advertised frequency
+ * IA32_APERF counts at the rate of actual CPU frequency
+ * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
+ * no meaning should be associated with absolute values of these MSRs.
+ */
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+					unsigned int cpu)
+{
+	struct aperfmperf perf;
+	unsigned long ratio;
+	unsigned int retval;
+
+	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
+		return 0;
+
+	ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
+	per_cpu(acfreq_old_perf, cpu) = perf;
+
+	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 0000000..5dbf295
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
+/*
+ *  (c) 2010 Advanced Micro Devices, Inc.
+ *  Your use of this code is subject to the terms and conditions of the
+ *  GNU general public license version 2. See "COPYING" or
+ *  http://www.gnu.org/licenses/gpl.html
+ */
+
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+					unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 74ca34b..52fce63 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -45,6 +45,7 @@
 #define PFX "powernow-k8: "
 #define VERSION "version 2.20.00"
 #include "powernow-k8.h"
+#include "mperf.h"
 
 /* serialize freq changes  */
 static DEFINE_MUTEX(fidvid_mutex);
@@ -57,6 +58,8 @@ static int cpu_family = CPU_OPTERON;
 static bool cpb_capable, cpb_enabled;
 static struct msr __percpu *msrs;
 
+static struct cpufreq_driver cpufreq_amd64_driver;
+
 #ifndef CONFIG_SMP
 static inline const struct cpumask *cpu_core_mask(int cpu)
 {
@@ -1251,6 +1254,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 	struct powernow_k8_data *data;
 	struct init_on_cpu init_on_cpu;
 	int rc;
+	struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
 
 	if (!cpu_online(pol->cpu))
 		return -ENODEV;
@@ -1325,6 +1329,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 		return -EINVAL;
 	}
 
+	/* Check for APERF/MPERF support in hardware */
+	if (cpu_has(c, X86_FEATURE_APERFMPERF))
+		cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
+
 	cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
 
 	if (cpu_family == CPU_HW_PSTATE)
-- 
cgit v1.1


From 679370641e3675633cad222449262abbe93a4a2a Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Wed, 31 Mar 2010 21:56:45 +0200
Subject: powernow-k8: Fix frequency reporting

With F10, model 10, all valid frequencies are in the ACPI _PST table.

Cc: <stable@kernel.org> # 33.x 32.x
Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <1270065406-1814-6-git-send-email-bp@amd64.org>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 52fce63..6f3dc8f 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -935,7 +935,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
 		powernow_table[i].index = index;
 
 		/* Frequency may be rounded for these */
-		if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
+		if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
+				 || boot_cpu_data.x86 == 0x11) {
 			powernow_table[i].frequency =
 				freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
 		} else
-- 
cgit v1.1


From 3f10940e4fb69d312602078f2c5234206797ca31 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@relay.de.ibm.com>
Date: Sat, 10 Apr 2010 16:46:21 +0200
Subject: x86/microcode: Use nonseekable_open()

No need to seek on this file, so prevent it outright so we can
avoid using default_llseek - removes one more BKL usage.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
[drop useless llseek = no_llseek and smp_lock.h inclusion]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnd Bergmann <arnd@relay.de.ibm.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Dmitry Adamushko <dmitry.adamushko@gmail.com>
LKML-Reference: <1270910781-8786-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index cceb5bc..2cd8c54 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size)
 	return error;
 }
 
-static int microcode_open(struct inode *unused1, struct file *unused2)
+static int microcode_open(struct inode *inode, struct file *file)
 {
-	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+	return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
 }
 
 static ssize_t microcode_write(struct file *file, const char __user *buf,
-- 
cgit v1.1


From b8f7fb13d2d7ff14818fd1d3edd8b834d38b0217 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 14 Apr 2010 11:35:46 -0500
Subject: x86, UV: Improve BAU performance and error recovery

- increase performance of the interrupt handler

- release timed-out software acknowledge resources

- recover from continuous-busy status due to a hardware issue

- add a 'throttle' to keep a uvhub from sending more than a
  specified number of broadcasts concurrently (work around the hardware issue)

- provide a 'nobau' boot command line option

- rename 'pnode' and 'node' to 'uvhub' (the 'node' terminology
  is ambiguous)

- add some new statistics about the scope of broadcasts, retries, the
  hardware issue and the 'throttle'

- split off new function uv_bau_retry_msg() from
  uv_bau_process_message() per community coding style feedback.

- simplify the argument list to uv_bau_process_message(), per
  community coding style feedback.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: linux-mm@kvack.org
Cc: Jack Steiner <steiner@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <E1O25Z4-0004Ur-PB@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 1270 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 900 insertions(+), 370 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ef68ba4..414f7c4f 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1,7 +1,7 @@
 /*
  *	SGI UltraViolet TLB flush routines.
  *
- *	(c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
+ *	(c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
  *
  *	This code is released under the GNU General Public License version 2 or
  *	later.
@@ -19,44 +19,67 @@
 #include <asm/idle.h>
 #include <asm/tsc.h>
 #include <asm/irq_vectors.h>
+#include <asm/timer.h>
+
+struct msg_desc {
+	struct bau_payload_queue_entry *msg;
+	int msg_slot;
+	int sw_ack_slot;
+	struct bau_payload_queue_entry *va_queue_first;
+	struct bau_payload_queue_entry *va_queue_last;
+};
 
 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	0x000000000bUL
 
-static struct bau_control	**uv_bau_table_bases __read_mostly;
-static int			uv_bau_retry_limit __read_mostly;
+static int uv_bau_max_concurrent __read_mostly;
 
-/* base pnode in this partition */
-static int			uv_partition_base_pnode __read_mostly;
+static int nobau;
+static int __init setup_nobau(char *arg)
+{
+	nobau = 1;
+	return 0;
+}
+early_param("nobau", setup_nobau);
 
-static unsigned long		uv_mmask __read_mostly;
+/* base pnode in this partition */
+static int uv_partition_base_pnode __read_mostly;
+/* position of pnode (which is nasid>>1): */
+static int uv_nshift __read_mostly;
+static unsigned long uv_mmask __read_mostly;
 
 static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
 static DEFINE_PER_CPU(struct bau_control, bau_control);
+static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
+
+struct reset_args {
+	int sender;
+};
 
 /*
- * Determine the first node on a blade.
+ * Determine the first node on a uvhub. 'Nodes' are used for kernel
+ * memory allocation.
  */
-static int __init blade_to_first_node(int blade)
+static int __init uvhub_to_first_node(int uvhub)
 {
 	int node, b;
 
 	for_each_online_node(node) {
 		b = uv_node_to_blade_id(node);
-		if (blade == b)
+		if (uvhub == b)
 			return node;
 	}
-	return -1; /* shouldn't happen */
+	return -1;
 }
 
 /*
- * Determine the apicid of the first cpu on a blade.
+ * Determine the apicid of the first cpu on a uvhub.
  */
-static int __init blade_to_first_apicid(int blade)
+static int __init uvhub_to_first_apicid(int uvhub)
 {
 	int cpu;
 
 	for_each_present_cpu(cpu)
-		if (blade == uv_cpu_to_blade_id(cpu))
+		if (uvhub == uv_cpu_to_blade_id(cpu))
 			return per_cpu(x86_cpu_to_apicid, cpu);
 	return -1;
 }
@@ -69,195 +92,459 @@ static int __init blade_to_first_apicid(int blade)
  * clear of the Timeout bit (as well) will free the resource. No reply will
  * be sent (the hardware will only do one reply per message).
  */
-static void uv_reply_to_message(int resource,
-				struct bau_payload_queue_entry *msg,
-				struct bau_msg_status *msp)
+static inline void uv_reply_to_message(struct msg_desc *mdp,
+				       struct bau_control *bcp)
 {
 	unsigned long dw;
+	struct bau_payload_queue_entry *msg;
 
-	dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
+	msg = mdp->msg;
+	if (!msg->canceled) {
+		dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
+						msg->sw_ack_vector;
+		uv_write_local_mmr(
+				UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
+	}
 	msg->replied_to = 1;
 	msg->sw_ack_vector = 0;
-	if (msp)
-		msp->seen_by.bits = 0;
-	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
 }
 
 /*
- * Do all the things a cpu should do for a TLB shootdown message.
- * Other cpu's may come here at the same time for this message.
+ * Process the receipt of a RETRY message
  */
-static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
-				   int msg_slot, int sw_ack_slot)
+static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
+					    struct bau_control *bcp)
 {
-	unsigned long this_cpu_mask;
-	struct bau_msg_status *msp;
-	int cpu;
+	int i;
+	int cancel_count = 0;
+	int slot2;
+	unsigned long msg_res;
+	unsigned long mmr = 0;
+	struct bau_payload_queue_entry *msg;
+	struct bau_payload_queue_entry *msg2;
+	struct ptc_stats *stat;
 
-	msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
-	cpu = uv_blade_processor_id();
-	msg->number_of_cpus =
-		uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
-	this_cpu_mask = 1UL << cpu;
-	if (msp->seen_by.bits & this_cpu_mask)
-		return;
-	atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
+	msg = mdp->msg;
+	stat = &per_cpu(ptcstats, bcp->cpu);
+	stat->d_retries++;
+	/*
+	 * cancel any message from msg+1 to the retry itself
+	 */
+	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
+		if (msg2 > mdp->va_queue_last)
+			msg2 = mdp->va_queue_first;
+		if (msg2 == msg)
+			break;
+
+		/* same conditions for cancellation as uv_do_reset */
+		if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
+		    (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
+			msg->sw_ack_vector) == 0) &&
+		    (msg2->sending_cpu == msg->sending_cpu) &&
+		    (msg2->msg_type != MSG_NOOP)) {
+			slot2 = msg2 - mdp->va_queue_first;
+			mmr = uv_read_local_mmr
+				(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+			msg_res = ((msg2->sw_ack_vector << 8) |
+				   msg2->sw_ack_vector);
+			/*
+			 * This is a message retry; clear the resources held
+			 * by the previous message only if they timed out.
+			 * If it has not timed out we have an unexpected
+			 * situation to report.
+			 */
+			if (mmr & (msg_res << 8)) {
+				/*
+				 * is the resource timed out?
+				 * make everyone ignore the cancelled message.
+				 */
+				msg2->canceled = 1;
+				stat->d_canceled++;
+				cancel_count++;
+				uv_write_local_mmr(
+				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+					(msg_res << 8) | msg_res);
+			} else
+				printk(KERN_INFO "note bau retry: no effect\n");
+		}
+	}
+	if (!cancel_count)
+		stat->d_nocanceled++;
+}
 
-	if (msg->replied_to == 1)
-		return;
+/*
+ * Do all the things a cpu should do for a TLB shootdown message.
+ * Other cpu's may come here at the same time for this message.
+ */
+static void uv_bau_process_message(struct msg_desc *mdp,
+				   struct bau_control *bcp)
+{
+	int msg_ack_count;
+	short socket_ack_count = 0;
+	struct ptc_stats *stat;
+	struct bau_payload_queue_entry *msg;
+	struct bau_control *smaster = bcp->socket_master;
 
+	/*
+	 * This must be a normal message, or retry of a normal message
+	 */
+	msg = mdp->msg;
+	stat = &per_cpu(ptcstats, bcp->cpu);
 	if (msg->address == TLB_FLUSH_ALL) {
 		local_flush_tlb();
-		__get_cpu_var(ptcstats).alltlb++;
+		stat->d_alltlb++;
 	} else {
 		__flush_tlb_one(msg->address);
-		__get_cpu_var(ptcstats).onetlb++;
+		stat->d_onetlb++;
 	}
+	stat->d_requestee++;
+
+	/*
+	 * One cpu on each uvhub has the additional job on a RETRY
+	 * of releasing the resource held by the message that is
+	 * being retried.  That message is identified by sending
+	 * cpu number.
+	 */
+	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
+		uv_bau_process_retry_msg(mdp, bcp);
 
-	__get_cpu_var(ptcstats).requestee++;
+	/*
+	 * This is a sw_ack message, so we have to reply to it.
+	 * Count each responding cpu on the socket. This avoids
+	 * pinging the count's cache line back and forth between
+	 * the sockets.
+	 */
+	socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
+			&smaster->socket_acknowledge_count[mdp->msg_slot]);
+	if (socket_ack_count == bcp->cpus_in_socket) {
+		/*
+		 * Both sockets dump their completed count total into
+		 * the message's count.
+		 */
+		smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
+		msg_ack_count = atomic_add_short_return(socket_ack_count,
+				(struct atomic_short *)&msg->acknowledge_count);
+
+		if (msg_ack_count == bcp->cpus_in_uvhub) {
+			/*
+			 * All cpus in uvhub saw it; reply
+			 */
+			uv_reply_to_message(mdp, bcp);
+		}
+	}
 
-	atomic_inc_short(&msg->acknowledge_count);
-	if (msg->number_of_cpus == msg->acknowledge_count)
-		uv_reply_to_message(sw_ack_slot, msg, msp);
+	return;
 }
 
 /*
- * Examine the payload queue on one distribution node to see
- * which messages have not been seen, and which cpu(s) have not seen them.
+ * Determine the first cpu on a uvhub.
+ */
+static int uvhub_to_first_cpu(int uvhub)
+{
+	int cpu;
+	for_each_present_cpu(cpu)
+		if (uvhub == uv_cpu_to_blade_id(cpu))
+			return cpu;
+	return -1;
+}
+
+/*
+ * Last resort when we get a large number of destination timeouts is
+ * to clear resources held by a given cpu.
+ * Do this with IPI so that all messages in the BAU message queue
+ * can be identified by their nonzero sw_ack_vector field.
  *
- * Returns the number of cpu's that have not responded.
+ * This is entered for a single cpu on the uvhub.
+ * The sender want's this uvhub to free a specific message's
+ * sw_ack resources.
  */
-static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
+static void
+uv_do_reset(void *ptr)
 {
-	struct bau_payload_queue_entry *msg;
-	struct bau_msg_status *msp;
-	int count = 0;
 	int i;
-	int j;
+	int slot;
+	int count = 0;
+	unsigned long mmr;
+	unsigned long msg_res;
+	struct bau_control *bcp;
+	struct reset_args *rap;
+	struct bau_payload_queue_entry *msg;
+	struct ptc_stats *stat;
 
-	for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
-	     msg++, i++) {
-		if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
-			msp = bau_tablesp->msg_statuses + i;
-			printk(KERN_DEBUG
-			       "blade %d: address:%#lx %d of %d, not cpu(s): ",
-			       i, msg->address, msg->acknowledge_count,
-			       msg->number_of_cpus);
-			for (j = 0; j < msg->number_of_cpus; j++) {
-				if (!((1L << j) & msp->seen_by.bits)) {
-					count++;
-					printk("%d ", j);
-				}
+	bcp = &per_cpu(bau_control, smp_processor_id());
+	rap = (struct reset_args *)ptr;
+	stat = &per_cpu(ptcstats, bcp->cpu);
+	stat->d_resets++;
+
+	/*
+	 * We're looking for the given sender, and
+	 * will free its sw_ack resource.
+	 * If all cpu's finally responded after the timeout, its
+	 * message 'replied_to' was set.
+	 */
+	for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
+		/* uv_do_reset: same conditions for cancellation as
+		   uv_bau_process_retry_msg() */
+		if ((msg->replied_to == 0) &&
+		    (msg->canceled == 0) &&
+		    (msg->sending_cpu == rap->sender) &&
+		    (msg->sw_ack_vector) &&
+		    (msg->msg_type != MSG_NOOP)) {
+			/*
+			 * make everyone else ignore this message
+			 */
+			msg->canceled = 1;
+			slot = msg - bcp->va_queue_first;
+			count++;
+			/*
+			 * only reset the resource if it is still pending
+			 */
+			mmr = uv_read_local_mmr
+					(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+			msg_res = ((msg->sw_ack_vector << 8) |
+						   msg->sw_ack_vector);
+			if (mmr & msg_res) {
+				stat->d_rcanceled++;
+				uv_write_local_mmr(
+				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+							msg_res);
 			}
-			printk("\n");
 		}
 	}
-	return count;
+	return;
 }
 
 /*
- * Examine the payload queue on all the distribution nodes to see
- * which messages have not been seen, and which cpu(s) have not seen them.
- *
- * Returns the number of cpu's that have not responded.
+ * Use IPI to get all target uvhubs to release resources held by
+ * a given sending cpu number.
  */
-static int uv_examine_destinations(struct bau_target_nodemask *distribution)
+static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
+			      int sender)
 {
-	int sender;
-	int i;
-	int count = 0;
+	int uvhub;
+	int cpu;
+	cpumask_t mask;
+	struct reset_args reset_args;
 
-	sender = smp_processor_id();
-	for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
-		if (!bau_node_isset(i, distribution))
+	reset_args.sender = sender;
+
+	cpus_clear(mask);
+	/* find a single cpu for each uvhub in this distribution mask */
+	for (uvhub = 0;
+		    uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
+		    uvhub++) {
+		if (!bau_uvhub_isset(uvhub, distribution))
 			continue;
-		count += uv_examine_destination(uv_bau_table_bases[i], sender);
+		/* find a cpu for this uvhub */
+		cpu = uvhub_to_first_cpu(uvhub);
+		cpu_set(cpu, mask);
 	}
-	return count;
+	/* IPI all cpus; Preemption is already disabled */
+	smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
+	return;
+}
+
+static inline unsigned long
+cycles_2_us(unsigned long long cyc)
+{
+	unsigned long long ns;
+	unsigned long us;
+	ns =  (cyc * per_cpu(cyc2ns, smp_processor_id()))
+						>> CYC2NS_SCALE_FACTOR;
+	us = ns / 1000;
+	return us;
 }
 
 /*
- * wait for completion of a broadcast message
- *
- * return COMPLETE, RETRY or GIVEUP
+ * wait for all cpus on this hub to finish their sends and go quiet
+ * leaves uvhub_quiesce set so that no new broadcasts are started by
+ * bau_flush_send_and_wait()
+ */
+static inline void
+quiesce_local_uvhub(struct bau_control *hmaster)
+{
+	atomic_add_short_return(1, (struct atomic_short *)
+		 &hmaster->uvhub_quiesce);
+}
+
+/*
+ * mark this quiet-requestor as done
+ */
+static inline void
+end_uvhub_quiesce(struct bau_control *hmaster)
+{
+	atomic_add_short_return(-1, (struct atomic_short *)
+		&hmaster->uvhub_quiesce);
+}
+
+/*
+ * Wait for completion of a broadcast software ack message
+ * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
  */
 static int uv_wait_completion(struct bau_desc *bau_desc,
-			      unsigned long mmr_offset, int right_shift)
+	unsigned long mmr_offset, int right_shift, int this_cpu,
+	struct bau_control *bcp, struct bau_control *smaster, long try)
 {
-	int exams = 0;
-	long destination_timeouts = 0;
-	long source_timeouts = 0;
+	int relaxes = 0;
 	unsigned long descriptor_status;
+	unsigned long mmr;
+	unsigned long mask;
+	cycles_t ttime;
+	cycles_t timeout_time;
+	struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
+	struct bau_control *hmaster;
 
+	hmaster = bcp->uvhub_master;
+	timeout_time = get_cycles() + bcp->timeout_interval;
+
+	/* spin on the status MMR, waiting for it to go idle */
 	while ((descriptor_status = (((unsigned long)
 		uv_read_local_mmr(mmr_offset) >>
 			right_shift) & UV_ACT_STATUS_MASK)) !=
 			DESC_STATUS_IDLE) {
-		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
-			source_timeouts++;
-			if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
-				source_timeouts = 0;
-			__get_cpu_var(ptcstats).s_retry++;
-			return FLUSH_RETRY;
-		}
 		/*
-		 * spin here looking for progress at the destinations
+		 * Our software ack messages may be blocked because there are
+		 * no swack resources available.  As long as none of them
+		 * has timed out hardware will NACK our message and its
+		 * state will stay IDLE.
 		 */
-		if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
-			destination_timeouts++;
-			if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
-				/*
-				 * returns number of cpus not responding
-				 */
-				if (uv_examine_destinations
-				    (&bau_desc->distribution) == 0) {
-					__get_cpu_var(ptcstats).d_retry++;
-					return FLUSH_RETRY;
-				}
-				exams++;
-				if (exams >= uv_bau_retry_limit) {
-					printk(KERN_DEBUG
-					       "uv_flush_tlb_others");
-					printk("giving up on cpu %d\n",
-					       smp_processor_id());
+		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+			stat->s_stimeout++;
+			return FLUSH_GIVEUP;
+		} else if (descriptor_status ==
+					DESC_STATUS_DESTINATION_TIMEOUT) {
+			stat->s_dtimeout++;
+			ttime = get_cycles();
+
+			/*
+			 * Our retries may be blocked by all destination
+			 * swack resources being consumed, and a timeout
+			 * pending.  In that case hardware returns the
+			 * ERROR that looks like a destination timeout.
+			 */
+			if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
+				bcp->conseccompletes = 0;
+				return FLUSH_RETRY_PLUGGED;
+			}
+
+			bcp->conseccompletes = 0;
+			return FLUSH_RETRY_TIMEOUT;
+		} else {
+			/*
+			 * descriptor_status is still BUSY
+			 */
+			cpu_relax();
+			relaxes++;
+			if (relaxes >= 10000) {
+				relaxes = 0;
+				if (get_cycles() > timeout_time) {
+					quiesce_local_uvhub(hmaster);
+
+					/* single-thread the register change */
+					spin_lock(&hmaster->masks_lock);
+					mmr = uv_read_local_mmr(mmr_offset);
+					mask = 0UL;
+					mask |= (3UL < right_shift);
+					mask = ~mask;
+					mmr &= mask;
+					uv_write_local_mmr(mmr_offset, mmr);
+					spin_unlock(&hmaster->masks_lock);
+					end_uvhub_quiesce(hmaster);
+					stat->s_busy++;
 					return FLUSH_GIVEUP;
 				}
-				/*
-				 * delays can hang the simulator
-				   udelay(1000);
-				 */
-				destination_timeouts = 0;
 			}
 		}
-		cpu_relax();
 	}
+	bcp->conseccompletes++;
 	return FLUSH_COMPLETE;
 }
 
+static inline cycles_t
+sec_2_cycles(unsigned long sec)
+{
+	unsigned long ns;
+	cycles_t cyc;
+
+	ns = sec * 1000000000;
+	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+	return cyc;
+}
+
+/*
+ * conditionally add 1 to *v, unless *v is >= u
+ * return 0 if we cannot add 1 to *v because it is >= u
+ * return 1 if we can add 1 to *v because it is < u
+ * the add is atomic
+ *
+ * This is close to atomic_add_unless(), but this allows the 'u' value
+ * to be lowered below the current 'v'.  atomic_add_unless can only stop
+ * on equal.
+ */
+static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+{
+	spin_lock(lock);
+	if (atomic_read(v) >= u) {
+		spin_unlock(lock);
+		return 0;
+	}
+	atomic_inc(v);
+	spin_unlock(lock);
+	return 1;
+}
+
 /**
  * uv_flush_send_and_wait
  *
- * Send a broadcast and wait for a broadcast message to complete.
+ * Send a broadcast and wait for it to complete.
  *
- * The flush_mask contains the cpus the broadcast was sent to.
+ * The flush_mask contains the cpus the broadcast is to be sent to, plus
+ * cpus that are on the local uvhub.
  *
- * Returns NULL if all remote flushing was done. The mask is zeroed.
+ * Returns NULL if all flushing represented in the mask was done. The mask
+ * is zeroed.
  * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set.
+ * mask will have some bits still set, representing any cpus on the local
+ * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
  */
-const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
-					     struct bau_desc *bau_desc,
-					     struct cpumask *flush_mask)
+const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
+					     struct cpumask *flush_mask,
+					     struct bau_control *bcp)
 {
-	int completion_status = 0;
 	int right_shift;
-	int tries = 0;
-	int pnode;
+	int uvhub;
 	int bit;
+	int completion_status = 0;
+	int seq_number = 0;
+	long try = 0;
+	int cpu = bcp->uvhub_cpu;
+	int this_cpu = bcp->cpu;
+	int this_uvhub = bcp->uvhub;
 	unsigned long mmr_offset;
 	unsigned long index;
 	cycles_t time1;
 	cycles_t time2;
+	struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
+	struct bau_control *smaster = bcp->socket_master;
+	struct bau_control *hmaster = bcp->uvhub_master;
+
+	/*
+	 * Spin here while there are hmaster->max_concurrent or more active
+	 * descriptors. This is the per-uvhub 'throttle'.
+	 */
+	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+			&hmaster->active_descriptor_count,
+			hmaster->max_concurrent)) {
+		stat->s_throttles++;
+		do {
+			cpu_relax();
+		} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+			&hmaster->active_descriptor_count,
+			hmaster->max_concurrent));
+	}
+
+	while (hmaster->uvhub_quiesce)
+		cpu_relax();
 
 	if (cpu < UV_CPUS_PER_ACT_STATUS) {
 		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -269,24 +556,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
 	}
 	time1 = get_cycles();
 	do {
-		tries++;
+		/*
+		 * Every message from any given cpu gets a unique message
+		 * sequence number. But retries use that same number.
+		 * Our message may have timed out at the destination because
+		 * all sw-ack resources are in use and there is a timeout
+		 * pending there.  In that case, our last send never got
+		 * placed into the queue and we need to persist until it
+		 * does.
+		 *
+		 * Make any retry a type MSG_RETRY so that the destination will
+		 * free any resource held by a previous message from this cpu.
+		 */
+		if (try == 0) {
+			/* use message type set by the caller the first time */
+			seq_number = bcp->message_number++;
+		} else {
+			/* use RETRY type on all the rest; same sequence */
+			bau_desc->header.msg_type = MSG_RETRY;
+			stat->s_retry_messages++;
+		}
+		bau_desc->header.sequence = seq_number;
 		index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
-			cpu;
+			bcp->uvhub_cpu;
+		bcp->send_message = get_cycles();
+
 		uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+
+		try++;
 		completion_status = uv_wait_completion(bau_desc, mmr_offset,
-					right_shift);
-	} while (completion_status == FLUSH_RETRY);
+			right_shift, this_cpu, bcp, smaster, try);
+
+		if (completion_status == FLUSH_RETRY_PLUGGED) {
+			/*
+			 * Our retries may be blocked by all destination swack
+			 * resources being consumed, and a timeout pending. In
+			 * that case hardware immediately returns the ERROR
+			 * that looks like a destination timeout.
+			 */
+			udelay(TIMEOUT_DELAY);
+			bcp->plugged_tries++;
+			if (bcp->plugged_tries >= PLUGSB4RESET) {
+				bcp->plugged_tries = 0;
+				quiesce_local_uvhub(hmaster);
+				spin_lock(&hmaster->queue_lock);
+				uv_reset_with_ipi(&bau_desc->distribution,
+							this_cpu);
+				spin_unlock(&hmaster->queue_lock);
+				end_uvhub_quiesce(hmaster);
+				bcp->ipi_attempts++;
+				stat->s_resets_plug++;
+			}
+		} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
+			hmaster->max_concurrent = 1;
+			bcp->timeout_tries++;
+			udelay(TIMEOUT_DELAY);
+			if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
+				bcp->timeout_tries = 0;
+				quiesce_local_uvhub(hmaster);
+				spin_lock(&hmaster->queue_lock);
+				uv_reset_with_ipi(&bau_desc->distribution,
+								this_cpu);
+				spin_unlock(&hmaster->queue_lock);
+				end_uvhub_quiesce(hmaster);
+				bcp->ipi_attempts++;
+				stat->s_resets_timeout++;
+			}
+		}
+		if (bcp->ipi_attempts >= 3) {
+			bcp->ipi_attempts = 0;
+			completion_status = FLUSH_GIVEUP;
+			break;
+		}
+		cpu_relax();
+	} while ((completion_status == FLUSH_RETRY_PLUGGED) ||
+		 (completion_status == FLUSH_RETRY_TIMEOUT));
 	time2 = get_cycles();
-	__get_cpu_var(ptcstats).sflush += (time2 - time1);
-	if (tries > 1)
-		__get_cpu_var(ptcstats).retriesok++;
 
-	if (completion_status == FLUSH_GIVEUP) {
+	if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
+	    && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
+			hmaster->max_concurrent++;
+
+	/*
+	 * hold any cpu not timing out here; no other cpu currently held by
+	 * the 'throttle' should enter the activation code
+	 */
+	while (hmaster->uvhub_quiesce)
+		cpu_relax();
+	atomic_dec(&hmaster->active_descriptor_count);
+
+	/* guard against cycles wrap */
+	if (time2 > time1)
+		stat->s_time += (time2 - time1);
+	else
+		stat->s_requestor--; /* don't count this one */
+	if (completion_status == FLUSH_COMPLETE && try > 1)
+		stat->s_retriesok++;
+	else if (completion_status == FLUSH_GIVEUP) {
 		/*
 		 * Cause the caller to do an IPI-style TLB shootdown on
-		 * the cpu's, all of which are still in the mask.
+		 * the target cpu's, all of which are still in the mask.
 		 */
-		__get_cpu_var(ptcstats).ptc_i++;
+		stat->s_giveup++;
 		return flush_mask;
 	}
 
@@ -295,18 +666,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
 	 * use the IPI method of shootdown on them.
 	 */
 	for_each_cpu(bit, flush_mask) {
-		pnode = uv_cpu_to_pnode(bit);
-		if (pnode == this_pnode)
+		uvhub = uv_cpu_to_blade_id(bit);
+		if (uvhub == this_uvhub)
 			continue;
 		cpumask_clear_cpu(bit, flush_mask);
 	}
 	if (!cpumask_empty(flush_mask))
 		return flush_mask;
+
 	return NULL;
 }
 
-static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
-
 /**
  * uv_flush_tlb_others - globally purge translation cache of a virtual
  * address or all TLB's
@@ -323,8 +693,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
  * The caller has derived the cpumask from the mm_struct.  This function
  * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
  *
- * The cpumask is converted into a nodemask of the nodes containing
- * the cpus.
+ * The cpumask is converted into a uvhubmask of the uvhubs containing
+ * those cpus.
  *
  * Note that this function should be called with preemption disabled.
  *
@@ -336,52 +706,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 					  struct mm_struct *mm,
 					  unsigned long va, unsigned int cpu)
 {
-	struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
-	int i;
-	int bit;
-	int pnode;
-	int uv_cpu;
-	int this_pnode;
+	int remotes;
+	int tcpu;
+	int uvhub;
 	int locals = 0;
 	struct bau_desc *bau_desc;
+	struct cpumask *flush_mask;
+	struct ptc_stats *stat;
+	struct bau_control *bcp;
 
-	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+	if (nobau)
+		return cpumask;
 
-	uv_cpu = uv_blade_processor_id();
-	this_pnode = uv_hub_info->pnode;
-	bau_desc = __get_cpu_var(bau_control).descriptor_base;
-	bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
+	bcp = &per_cpu(bau_control, cpu);
+	/*
+	 * Each sending cpu has a per-cpu mask which it fills from the caller's
+	 * cpu mask.  Only remote cpus are converted to uvhubs and copied.
+	 */
+	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
+	/*
+	 * copy cpumask to flush_mask, removing current cpu
+	 * (current cpu should already have been flushed by the caller and
+	 *  should never be returned if we return flush_mask)
+	 */
+	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+	if (cpu_isset(cpu, *cpumask))
+		locals++;  /* current cpu was targeted */
 
-	bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+	bau_desc = bcp->descriptor_base;
+	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
 
-	i = 0;
-	for_each_cpu(bit, flush_mask) {
-		pnode = uv_cpu_to_pnode(bit);
-		BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
-		if (pnode == this_pnode) {
+	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+	remotes = 0;
+	for_each_cpu(tcpu, flush_mask) {
+		uvhub = uv_cpu_to_blade_id(tcpu);
+		if (uvhub == bcp->uvhub) {
 			locals++;
 			continue;
 		}
-		bau_node_set(pnode - uv_partition_base_pnode,
-				&bau_desc->distribution);
-		i++;
+		bau_uvhub_set(uvhub, &bau_desc->distribution);
+		remotes++;
 	}
-	if (i == 0) {
+	if (remotes == 0) {
 		/*
-		 * no off_node flushing; return status for local node
+		 * No off_hub flushing; return status for local hub.
+		 * Return the caller's mask if all were local (the current
+		 * cpu may be in that mask).
 		 */
 		if (locals)
-			return flush_mask;
+			return cpumask;
 		else
 			return NULL;
 	}
-	__get_cpu_var(ptcstats).requestor++;
-	__get_cpu_var(ptcstats).ntargeted += i;
+	stat = &per_cpu(ptcstats, cpu);
+	stat->s_requestor++;
+	stat->s_ntargcpu += remotes;
+	remotes = bau_uvhub_weight(&bau_desc->distribution);
+	stat->s_ntarguvhub += remotes;
+	if (remotes >= 16)
+		stat->s_ntarguvhub16++;
+	else if (remotes >= 8)
+		stat->s_ntarguvhub8++;
+	else if (remotes >= 4)
+		stat->s_ntarguvhub4++;
+	else if (remotes >= 2)
+		stat->s_ntarguvhub2++;
+	else
+		stat->s_ntarguvhub1++;
 
 	bau_desc->payload.address = va;
 	bau_desc->payload.sending_cpu = cpu;
 
-	return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
+	/*
+	 * uv_flush_send_and_wait returns null if all cpu's were messaged, or
+	 * the adjusted flush_mask if any cpu's were not messaged.
+	 */
+	return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
 }
 
 /*
@@ -390,87 +790,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
  *
  * We received a broadcast assist message.
  *
- * Interrupts may have been disabled; this interrupt could represent
+ * Interrupts are disabled; this interrupt could represent
  * the receipt of several messages.
  *
- * All cores/threads on this node get this interrupt.
- * The last one to see it does the s/w ack.
+ * All cores/threads on this hub get this interrupt.
+ * The last one to see it does the software ack.
  * (the resource will not be freed until noninterruptable cpus see this
- *  interrupt; hardware will timeout the s/w ack and reply ERROR)
+ *  interrupt; hardware may timeout the s/w ack and reply ERROR)
  */
 void uv_bau_message_interrupt(struct pt_regs *regs)
 {
-	struct bau_payload_queue_entry *va_queue_first;
-	struct bau_payload_queue_entry *va_queue_last;
-	struct bau_payload_queue_entry *msg;
-	struct pt_regs *old_regs = set_irq_regs(regs);
-	cycles_t time1;
-	cycles_t time2;
-	int msg_slot;
-	int sw_ack_slot;
-	int fw;
 	int count = 0;
-	unsigned long local_pnode;
-
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
-
-	time1 = get_cycles();
-
-	local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
-
-	va_queue_first = __get_cpu_var(bau_control).va_queue_first;
-	va_queue_last = __get_cpu_var(bau_control).va_queue_last;
-
-	msg = __get_cpu_var(bau_control).bau_msg_head;
+	cycles_t time_start;
+	struct bau_payload_queue_entry *msg;
+	struct bau_control *bcp;
+	struct ptc_stats *stat;
+	struct msg_desc msgdesc;
+
+	time_start = get_cycles();
+	bcp = &per_cpu(bau_control, smp_processor_id());
+	stat = &per_cpu(ptcstats, smp_processor_id());
+	msgdesc.va_queue_first = bcp->va_queue_first;
+	msgdesc.va_queue_last = bcp->va_queue_last;
+	msg = bcp->bau_msg_head;
 	while (msg->sw_ack_vector) {
 		count++;
-		fw = msg->sw_ack_vector;
-		msg_slot = msg - va_queue_first;
-		sw_ack_slot = ffs(fw) - 1;
-
-		uv_bau_process_message(msg, msg_slot, sw_ack_slot);
-
+		msgdesc.msg_slot = msg - msgdesc.va_queue_first;
+		msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
+		msgdesc.msg = msg;
+		uv_bau_process_message(&msgdesc, bcp);
 		msg++;
-		if (msg > va_queue_last)
-			msg = va_queue_first;
-		__get_cpu_var(bau_control).bau_msg_head = msg;
+		if (msg > msgdesc.va_queue_last)
+			msg = msgdesc.va_queue_first;
+		bcp->bau_msg_head = msg;
 	}
+	stat->d_time += (get_cycles() - time_start);
 	if (!count)
-		__get_cpu_var(ptcstats).nomsg++;
+		stat->d_nomsg++;
 	else if (count > 1)
-		__get_cpu_var(ptcstats).multmsg++;
-
-	time2 = get_cycles();
-	__get_cpu_var(ptcstats).dflush += (time2 - time1);
-
-	irq_exit();
-	set_irq_regs(old_regs);
+		stat->d_multmsg++;
+	ack_APIC_irq();
 }
 
 /*
  * uv_enable_timeouts
  *
- * Each target blade (i.e. blades that have cpu's) needs to have
+ * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
  * shootdown message timeouts enabled.  The timeout does not cause
  * an interrupt, but causes an error message to be returned to
  * the sender.
  */
 static void uv_enable_timeouts(void)
 {
-	int blade;
-	int nblades;
+	int uvhub;
+	int nuvhubs;
 	int pnode;
 	unsigned long mmr_image;
 
-	nblades = uv_num_possible_blades();
+	nuvhubs = uv_num_possible_blades();
 
-	for (blade = 0; blade < nblades; blade++) {
-		if (!uv_blade_nr_possible_cpus(blade))
+	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+		if (!uv_blade_nr_possible_cpus(uvhub))
 			continue;
 
-		pnode = uv_blade_to_pnode(blade);
+		pnode = uv_blade_to_pnode(uvhub);
 		mmr_image =
 		    uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
 		/*
@@ -523,9 +906,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
 {
 }
 
+static inline unsigned long long
+millisec_2_cycles(unsigned long millisec)
+{
+	unsigned long ns;
+	unsigned long long cyc;
+
+	ns = millisec * 1000;
+	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+	return cyc;
+}
+
 /*
- * Display the statistics thru /proc
- * data points to the cpu number
+ * Display the statistics thru /proc.
+ * 'data' points to the cpu number
  */
 static int uv_ptc_seq_show(struct seq_file *file, void *data)
 {
@@ -536,78 +930,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 
 	if (!cpu) {
 		seq_printf(file,
-		"# cpu requestor requestee one all sretry dretry ptc_i ");
+			"# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
 		seq_printf(file,
-		"sw_ack sflush dflush sok dnomsg dmult starget\n");
+			"numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+		seq_printf(file,
+			"retries rok resetp resett giveup sto bz throt ");
+		seq_printf(file,
+			"sw_ack recv rtime all ");
+		seq_printf(file,
+			"one mult none retry canc nocan reset rcan\n");
 	}
 	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 		stat = &per_cpu(ptcstats, cpu);
-		seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
-			   cpu, stat->requestor,
-			   stat->requestee, stat->onetlb, stat->alltlb,
-			   stat->s_retry, stat->d_retry, stat->ptc_i);
-		seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
+		/* source side statistics */
+		seq_printf(file,
+			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
+			   cpu, stat->s_requestor, cycles_2_us(stat->s_time),
+			   stat->s_ntarguvhub, stat->s_ntarguvhub16,
+			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,
+			   stat->s_ntarguvhub2, stat->s_ntarguvhub1,
+			   stat->s_ntargcpu, stat->s_dtimeout);
+		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
+			   stat->s_retry_messages, stat->s_retriesok,
+			   stat->s_resets_plug, stat->s_resets_timeout,
+			   stat->s_giveup, stat->s_stimeout,
+			   stat->s_busy, stat->s_throttles);
+		/* destination side statistics */
+		seq_printf(file,
+			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
 			   uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
 					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
-			   stat->sflush, stat->dflush,
-			   stat->retriesok, stat->nomsg,
-			   stat->multmsg, stat->ntargeted);
+			   stat->d_requestee, cycles_2_us(stat->d_time),
+			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
+			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
+			   stat->d_nocanceled, stat->d_resets,
+			   stat->d_rcanceled);
 	}
 
 	return 0;
 }
 
 /*
+ * -1: resetf the statistics
  *  0: display meaning of the statistics
- * >0: retry limit
+ * >0: maximum concurrent active descriptors per uvhub (throttle)
  */
 static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 				 size_t count, loff_t *data)
 {
-	long newmode;
+	int cpu;
+	long input_arg;
 	char optstr[64];
+	struct ptc_stats *stat;
+	struct bau_control *bcp;
 
 	if (count == 0 || count > sizeof(optstr))
 		return -EINVAL;
 	if (copy_from_user(optstr, user, count))
 		return -EFAULT;
 	optstr[count - 1] = '\0';
-	if (strict_strtoul(optstr, 10, &newmode) < 0) {
+	if (strict_strtol(optstr, 10, &input_arg) < 0) {
 		printk(KERN_DEBUG "%s is invalid\n", optstr);
 		return -EINVAL;
 	}
 
-	if (newmode == 0) {
+	if (input_arg == 0) {
 		printk(KERN_DEBUG "# cpu:      cpu number\n");
+		printk(KERN_DEBUG "Sender statistics:\n");
+		printk(KERN_DEBUG
+		"sent:     number of shootdown messages sent\n");
+		printk(KERN_DEBUG
+		"stime:    time spent sending messages\n");
+		printk(KERN_DEBUG
+		"numuvhubs: number of hubs targeted with shootdown\n");
+		printk(KERN_DEBUG
+		"numuvhubs16: number times 16 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs8: number times 8 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs4: number times 4 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs2: number times 2 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs1: number times 1 hub targeted\n");
+		printk(KERN_DEBUG
+		"numcpus:  number of cpus targeted with shootdown\n");
+		printk(KERN_DEBUG
+		"dto:      number of destination timeouts\n");
+		printk(KERN_DEBUG
+		"retries:  destination timeout retries sent\n");
+		printk(KERN_DEBUG
+		"rok:   :  destination timeouts successfully retried\n");
+		printk(KERN_DEBUG
+		"resetp:   ipi-style resource resets for plugs\n");
+		printk(KERN_DEBUG
+		"resett:   ipi-style resource resets for timeouts\n");
+		printk(KERN_DEBUG
+		"giveup:   fall-backs to ipi-style shootdowns\n");
+		printk(KERN_DEBUG
+		"sto:      number of source timeouts\n");
+		printk(KERN_DEBUG
+		"bz:       number of stay-busy's\n");
+		printk(KERN_DEBUG
+		"throt:    number times spun in throttle\n");
+		printk(KERN_DEBUG "Destination side statistics:\n");
 		printk(KERN_DEBUG
-		"requestor:  times this cpu was the flush requestor\n");
+		"sw_ack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
 		printk(KERN_DEBUG
-		"requestee:  times this cpu was requested to flush its TLBs\n");
+		"recv:     shootdown messages received\n");
 		printk(KERN_DEBUG
-		"one:        times requested to flush a single address\n");
+		"rtime:    time spent processing messages\n");
 		printk(KERN_DEBUG
-		"all:        times requested to flush all TLB's\n");
+		"all:      shootdown all-tlb messages\n");
 		printk(KERN_DEBUG
-		"sretry:     number of retries of source-side timeouts\n");
+		"one:      shootdown one-tlb messages\n");
 		printk(KERN_DEBUG
-		"dretry:     number of retries of destination-side timeouts\n");
+		"mult:     interrupts that found multiple messages\n");
 		printk(KERN_DEBUG
-		"ptc_i:      times UV fell through to IPI-style flushes\n");
+		"none:     interrupts that found no messages\n");
 		printk(KERN_DEBUG
-		"sw_ack:     image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
+		"retry:    number of retry messages processed\n");
 		printk(KERN_DEBUG
-		"sflush_us:  cycles spent in uv_flush_tlb_others()\n");
+		"canc:     number messages canceled by retries\n");
 		printk(KERN_DEBUG
-		"dflush_us:  cycles spent in handling flush requests\n");
-		printk(KERN_DEBUG "sok:        successes on retry\n");
-		printk(KERN_DEBUG "dnomsg:     interrupts with no message\n");
+		"nocan:    number retries that found nothing to cancel\n");
 		printk(KERN_DEBUG
-		"dmult:      interrupts with multiple messages\n");
-		printk(KERN_DEBUG "starget:    nodes targeted\n");
+		"reset:    number of ipi-style reset requests processed\n");
+		printk(KERN_DEBUG
+		"rcan:     number messages canceled by reset requests\n");
+	} else if (input_arg == -1) {
+		for_each_present_cpu(cpu) {
+			stat = &per_cpu(ptcstats, cpu);
+			memset(stat, 0, sizeof(struct ptc_stats));
+		}
 	} else {
-		uv_bau_retry_limit = newmode;
-		printk(KERN_DEBUG "timeout retry limit:%d\n",
-		       uv_bau_retry_limit);
+		uv_bau_max_concurrent = input_arg;
+		bcp = &per_cpu(bau_control, smp_processor_id());
+		if (uv_bau_max_concurrent < 1 ||
+		    uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
+			printk(KERN_DEBUG
+				"Error: BAU max concurrent %d; %d is invalid\n",
+				bcp->max_concurrent, uv_bau_max_concurrent);
+			return -EINVAL;
+		}
+		printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
+		       uv_bau_max_concurrent);
+		for_each_present_cpu(cpu) {
+			bcp = &per_cpu(bau_control, cpu);
+			bcp->max_concurrent = uv_bau_max_concurrent;
+		}
 	}
 
 	return count;
@@ -651,79 +1122,30 @@ static int __init uv_ptc_init(void)
 }
 
 /*
- * begin the initialization of the per-blade control structures
- */
-static struct bau_control * __init uv_table_bases_init(int blade, int node)
-{
-	int i;
-	struct bau_msg_status *msp;
-	struct bau_control *bau_tabp;
-
-	bau_tabp =
-	    kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
-	BUG_ON(!bau_tabp);
-
-	bau_tabp->msg_statuses =
-	    kmalloc_node(sizeof(struct bau_msg_status) *
-			 DEST_Q_SIZE, GFP_KERNEL, node);
-	BUG_ON(!bau_tabp->msg_statuses);
-
-	for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
-		bau_cpubits_clear(&msp->seen_by, (int)
-				  uv_blade_nr_possible_cpus(blade));
-
-	uv_bau_table_bases[blade] = bau_tabp;
-
-	return bau_tabp;
-}
-
-/*
- * finish the initialization of the per-blade control structures
- */
-static void __init
-uv_table_bases_finish(int blade,
-		      struct bau_control *bau_tablesp,
-		      struct bau_desc *adp)
-{
-	struct bau_control *bcp;
-	int cpu;
-
-	for_each_present_cpu(cpu) {
-		if (blade != uv_cpu_to_blade_id(cpu))
-			continue;
-
-		bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
-		bcp->bau_msg_head	= bau_tablesp->va_queue_first;
-		bcp->va_queue_first	= bau_tablesp->va_queue_first;
-		bcp->va_queue_last	= bau_tablesp->va_queue_last;
-		bcp->msg_statuses	= bau_tablesp->msg_statuses;
-		bcp->descriptor_base	= adp;
-	}
-}
-
-/*
  * initialize the sending side's sending buffers
  */
-static struct bau_desc * __init
+static void
 uv_activation_descriptor_init(int node, int pnode)
 {
 	int i;
+	int cpu;
 	unsigned long pa;
 	unsigned long m;
 	unsigned long n;
-	struct bau_desc *adp;
-	struct bau_desc *ad2;
+	struct bau_desc *bau_desc;
+	struct bau_desc *bd2;
+	struct bau_control *bcp;
 
 	/*
 	 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
-	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
 	 */
-	adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+	bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
 		UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
-	BUG_ON(!adp);
+	BUG_ON(!bau_desc);
 
-	pa = uv_gpa(adp); /* need the real nasid*/
-	n = uv_gpa_to_pnode(pa);
+	pa = uv_gpa(bau_desc); /* need the real nasid*/
+	n = pa >> uv_nshift;
 	m = pa & uv_mmask;
 
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -732,96 +1154,188 @@ uv_activation_descriptor_init(int node, int pnode)
 	/*
 	 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
 	 * cpu even though we only use the first one; one descriptor can
-	 * describe a broadcast to 256 nodes.
+	 * describe a broadcast to 256 uv hubs.
 	 */
-	for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
-		i++, ad2++) {
-		memset(ad2, 0, sizeof(struct bau_desc));
-		ad2->header.sw_ack_flag = 1;
+	for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+		i++, bd2++) {
+		memset(bd2, 0, sizeof(struct bau_desc));
+		bd2->header.sw_ack_flag = 1;
 		/*
-		 * base_dest_nodeid is the first node in the partition, so
-		 * the bit map will indicate partition-relative node numbers.
-		 * note that base_dest_nodeid is actually a nasid.
+		 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
+		 * in the partition. The bit map will indicate uvhub numbers,
+		 * which are 0-N in a partition. Pnodes are unique system-wide.
 		 */
-		ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
-		ad2->header.dest_subnodeid = 0x10; /* the LB */
-		ad2->header.command = UV_NET_ENDPOINT_INTD;
-		ad2->header.int_both = 1;
+		bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
+		bd2->header.dest_subnodeid = 0x10; /* the LB */
+		bd2->header.command = UV_NET_ENDPOINT_INTD;
+		bd2->header.int_both = 1;
 		/*
 		 * all others need to be set to zero:
 		 *   fairness chaining multilevel count replied_to
 		 */
 	}
-	return adp;
+	for_each_present_cpu(cpu) {
+		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
+			continue;
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->descriptor_base = bau_desc;
+	}
 }
 
 /*
  * initialize the destination side's receiving buffers
+ * entered for each uvhub in the partition
+ * - node is first node (kernel memory notion) on the uvhub
+ * - pnode is the uvhub's physical identifier
  */
-static struct bau_payload_queue_entry * __init
-uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
+static void
+uv_payload_queue_init(int node, int pnode)
 {
-	struct bau_payload_queue_entry *pqp;
-	unsigned long pa;
 	int pn;
+	int cpu;
 	char *cp;
+	unsigned long pa;
+	struct bau_payload_queue_entry *pqp;
+	struct bau_payload_queue_entry *pqp_malloc;
+	struct bau_control *bcp;
 
 	pqp = (struct bau_payload_queue_entry *) kmalloc_node(
 		(DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
 		GFP_KERNEL, node);
 	BUG_ON(!pqp);
+	pqp_malloc = pqp;
 
 	cp = (char *)pqp + 31;
 	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
-	bau_tablesp->va_queue_first = pqp;
+
+	for_each_present_cpu(cpu) {
+		if (pnode != uv_cpu_to_pnode(cpu))
+			continue;
+		/* for every cpu on this pnode: */
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->va_queue_first = pqp;
+		bcp->bau_msg_head = pqp;
+		bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
+	}
 	/*
 	 * need the pnode of where the memory was really allocated
 	 */
 	pa = uv_gpa(pqp);
-	pn = uv_gpa_to_pnode(pa);
+	pn = pa >> uv_nshift;
 	uv_write_global_mmr64(pnode,
 			      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
 			      ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
 			      uv_physnodeaddr(pqp));
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
 			      uv_physnodeaddr(pqp));
-	bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
 			      (unsigned long)
-			      uv_physnodeaddr(bau_tablesp->va_queue_last));
+			      uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
+	/* in effect, all msg_type's are set to MSG_NOOP */
 	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
-
-	return pqp;
 }
 
 /*
- * Initialization of each UV blade's structures
+ * Initialization of each UV hub's structures
  */
-static int __init uv_init_blade(int blade)
+static void __init uv_init_uvhub(int uvhub, int vector)
 {
 	int node;
 	int pnode;
-	unsigned long pa;
 	unsigned long apicid;
-	struct bau_desc *adp;
-	struct bau_payload_queue_entry *pqp;
-	struct bau_control *bau_tablesp;
-
-	node = blade_to_first_node(blade);
-	bau_tablesp = uv_table_bases_init(blade, node);
-	pnode = uv_blade_to_pnode(blade);
-	adp = uv_activation_descriptor_init(node, pnode);
-	pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
-	uv_table_bases_finish(blade, bau_tablesp, adp);
+
+	node = uvhub_to_first_node(uvhub);
+	pnode = uv_blade_to_pnode(uvhub);
+	uv_activation_descriptor_init(node, pnode);
+	uv_payload_queue_init(node, pnode);
 	/*
 	 * the below initialization can't be in firmware because the
 	 * messaging IRQ will be determined by the OS
 	 */
-	apicid = blade_to_first_apicid(blade);
-	pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
+	apicid = uvhub_to_first_apicid(uvhub);
 	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
-				      ((apicid << 32) | UV_BAU_MESSAGE));
-	return 0;
+				      ((apicid << 32) | vector));
+}
+
+/*
+ * initialize the bau_control structure for each cpu
+ */
+static void uv_init_per_cpu(int nuvhubs)
+{
+	int i, j, k;
+	int cpu;
+	int pnode;
+	int uvhub;
+	short socket = 0;
+	struct bau_control *bcp;
+	struct uvhub_desc *bdp;
+	struct socket_desc *sdp;
+	struct bau_control *hmaster = NULL;
+	struct bau_control *smaster = NULL;
+	struct socket_desc {
+		short num_cpus;
+		short cpu_number[16];
+	};
+	struct uvhub_desc {
+		short num_sockets;
+		short num_cpus;
+		short uvhub;
+		short pnode;
+		struct socket_desc socket[2];
+	};
+	struct uvhub_desc *uvhub_descs;
+
+	uvhub_descs = (struct uvhub_desc *)
+		kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
+	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+	for_each_present_cpu(cpu) {
+		bcp = &per_cpu(bau_control, cpu);
+		memset(bcp, 0, sizeof(struct bau_control));
+		spin_lock_init(&bcp->masks_lock);
+		bcp->max_concurrent = uv_bau_max_concurrent;
+		pnode = uv_cpu_hub_info(cpu)->pnode;
+		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+		bdp = &uvhub_descs[uvhub];
+		bdp->num_cpus++;
+		bdp->uvhub = uvhub;
+		bdp->pnode = pnode;
+		/* time interval to catch a hardware stay-busy bug */
+		bcp->timeout_interval = millisec_2_cycles(3);
+		/* kludge: assume uv_hub.h is constant */
+		socket = (cpu_physical_id(cpu)>>5)&1;
+		if (socket >= bdp->num_sockets)
+			bdp->num_sockets = socket+1;
+		sdp = &bdp->socket[socket];
+		sdp->cpu_number[sdp->num_cpus] = cpu;
+		sdp->num_cpus++;
+	}
+	socket = 0;
+	for_each_possible_blade(uvhub) {
+		bdp = &uvhub_descs[uvhub];
+		for (i = 0; i < bdp->num_sockets; i++) {
+			sdp = &bdp->socket[i];
+			for (j = 0; j < sdp->num_cpus; j++) {
+				cpu = sdp->cpu_number[j];
+				bcp = &per_cpu(bau_control, cpu);
+				bcp->cpu = cpu;
+				if (j == 0) {
+					smaster = bcp;
+					if (i == 0)
+						hmaster = bcp;
+				}
+				bcp->cpus_in_uvhub = bdp->num_cpus;
+				bcp->cpus_in_socket = sdp->num_cpus;
+				bcp->socket_master = smaster;
+				bcp->uvhub_master = hmaster;
+				for (k = 0; k < DEST_Q_SIZE; k++)
+					bcp->socket_acknowledge_count[k] = 0;
+				bcp->uvhub_cpu =
+				  uv_cpu_hub_info(cpu)->blade_processor_id;
+			}
+			socket++;
+		}
+	}
+	kfree(uvhub_descs);
 }
 
 /*
@@ -829,38 +1343,54 @@ static int __init uv_init_blade(int blade)
  */
 static int __init uv_bau_init(void)
 {
-	int blade;
-	int nblades;
+	int uvhub;
+	int pnode;
+	int nuvhubs;
 	int cur_cpu;
+	int vector;
+	unsigned long mmr;
 
 	if (!is_uv_system())
 		return 0;
 
+	if (nobau)
+		return 0;
+
 	for_each_possible_cpu(cur_cpu)
 		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
 				       GFP_KERNEL, cpu_to_node(cur_cpu));
 
-	uv_bau_retry_limit = 1;
+	uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
+	uv_nshift = uv_hub_info->m_val;
 	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
-	nblades = uv_num_possible_blades();
+	nuvhubs = uv_num_possible_blades();
 
-	uv_bau_table_bases = (struct bau_control **)
-	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
-	BUG_ON(!uv_bau_table_bases);
+	uv_init_per_cpu(nuvhubs);
 
 	uv_partition_base_pnode = 0x7fffffff;
-	for (blade = 0; blade < nblades; blade++)
-		if (uv_blade_nr_possible_cpus(blade) &&
-			(uv_blade_to_pnode(blade) < uv_partition_base_pnode))
-			uv_partition_base_pnode = uv_blade_to_pnode(blade);
-	for (blade = 0; blade < nblades; blade++)
-		if (uv_blade_nr_possible_cpus(blade))
-			uv_init_blade(blade);
-
-	alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
+	for (uvhub = 0; uvhub < nuvhubs; uvhub++)
+		if (uv_blade_nr_possible_cpus(uvhub) &&
+			(uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
+			uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
+
+	vector = UV_BAU_MESSAGE;
+	for_each_possible_blade(uvhub)
+		if (uv_blade_nr_possible_cpus(uvhub))
+			uv_init_uvhub(uvhub, vector);
+
 	uv_enable_timeouts();
+	alloc_intr_gate(vector, uv_bau_message_intr1);
+
+	for_each_possible_blade(uvhub) {
+		pnode = uv_blade_to_pnode(uvhub);
+		/* INIT the bau */
+		uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
+				      ((unsigned long)1 << 63));
+		mmr = 1; /* should be 1 to broadcast to both sockets */
+		uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
+	}
 
 	return 0;
 }
-__initcall(uv_bau_init);
-__initcall(uv_ptc_init);
+core_initcall(uv_bau_init);
+core_initcall(uv_ptc_init);
-- 
cgit v1.1


From a289cc7c70da784a2d370b91885cab4f966dcb0f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Fri, 16 Apr 2010 17:51:42 -0700
Subject: x86, UV: uv_irq.c: Fix all sparse warnings

Fix all sparse warnings in building uv_irq.c.

 arch/x86/kernel/uv_irq.c:46:17: warning: symbol 'uv_irq_chip' was not declared. Should it be static?
 arch/x86/kernel/uv_irq.c:143:50: error: no identifier for function argument
 arch/x86/kernel/uv_irq.c:162:13: error: typename in expression
 arch/x86/kernel/uv_irq.c:162:13: error: undefined identifier 'restrict'
 arch/x86/kernel/uv_irq.c:250:44: error: no identifier for function argument
 arch/x86/kernel/uv_irq.c:260:17: error: typename in expression
 arch/x86/kernel/uv_irq.c:260:17: error: undefined identifier 'restrict'
 arch/x86/kernel/uv_irq.c:233:50: warning: incorrect type in argument 3 (different signedness)
 arch/x86/kernel/uv_irq.c:233:50:    expected int *pnode
 arch/x86/kernel/uv_irq.c:233:50:    got unsigned int *<noident>
 arch/x86/include/asm/uv/uv_hub.h:318:44: warning: incorrect type in argument 2 (different address spaces)
 arch/x86/include/asm/uv/uv_hub.h:318:44:    expected void volatile [noderef] <asn:2>*addr
 arch/x86/include/asm/uv/uv_hub.h:318:44:    got unsigned long *

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Cc: Cliff Wickman <cpw@sgi.com>
Cc: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100416175142.f4b59683.randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/uv_irq.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index ece73d8..1a9f55a 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -43,7 +43,7 @@ static void uv_ack_apic(unsigned int irq)
 	ack_APIC_irq();
 }
 
-struct irq_chip uv_irq_chip = {
+static struct irq_chip uv_irq_chip = {
 	.name		= "UV-CORE",
 	.startup	= uv_noop_ret,
 	.shutdown	= uv_noop,
@@ -140,7 +140,7 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
  */
 static int
 arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
-		       unsigned long mmr_offset, int restrict)
+		       unsigned long mmr_offset, int limit)
 {
 	const struct cpumask *eligible_cpu = cpumask_of(cpu);
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -159,7 +159,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	if (err != 0)
 		return err;
 
-	if (restrict == UV_AFFINITY_CPU)
+	if (limit == UV_AFFINITY_CPU)
 		desc->status |= IRQ_NO_BALANCING;
 	else
 		desc->status |= IRQ_MOVE_PCNTXT;
@@ -213,7 +213,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
 	unsigned long mmr_value;
 	struct uv_IO_APIC_route_entry *entry;
 	unsigned long mmr_offset;
-	unsigned mmr_pnode;
+	int mmr_pnode;
 
 	if (set_desc_affinity(desc, mask, &dest))
 		return -1;
@@ -247,7 +247,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
  * interrupt is raised.
  */
 int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
-		 unsigned long mmr_offset, int restrict)
+		 unsigned long mmr_offset, int limit)
 {
 	int irq, ret;
 
@@ -257,7 +257,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
 		return -EBUSY;
 
 	ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
-		restrict);
+		limit);
 	if (ret == irq)
 		uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
 	else
-- 
cgit v1.1


From b1ab1b4d9ab9812c77843abec79030292ef0a544 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:06:58 +0200
Subject: x86, cacheinfo: Unify AMD L3 cache index disable checking

All F10h CPUs starting with model 8 resp. 9, stepping 1, support L3
cache index disable. Concentrate the family, model, stepping checking at
one place and enable the feature implicitly on upcoming Fam10h models.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-2-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66..acfb083 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -328,18 +328,22 @@ static unsigned int __cpuinit amd_calc_l3_indices(void)
 static void __cpuinit
 amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
-	if (index < 3)
+	if (boot_cpu_data.x86 != 0x10)
 		return;
 
-	if (boot_cpu_data.x86 == 0x11)
+	if (index < 3)
 		return;
 
 	/* see errata #382 and #388 */
-	if ((boot_cpu_data.x86 == 0x10) &&
-	    ((boot_cpu_data.x86_model < 0x8) ||
-	     (boot_cpu_data.x86_mask  < 0x1)))
+	if (boot_cpu_data.x86_model < 0x8)
 		return;
 
+	if ((boot_cpu_data.x86_model == 0x8 ||
+	     boot_cpu_data.x86_model == 0x9)
+		&&
+	     boot_cpu_data.x86_mask < 0x1)
+			return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
@@ -443,8 +447,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 		amd_cpuid4(index, &eax, &ebx, &ecx);
-		if (boot_cpu_data.x86 >= 0x10)
-			amd_check_l3_disable(index, this_leaf);
+		amd_check_l3_disable(index, this_leaf);
 	} else {
 		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
 	}
-- 
cgit v1.1


From f2b20e41407fccfcfacf927ff91ec888832a37af Mon Sep 17 00:00:00 2001
From: Frank Arnold <frank.arnold@amd.com>
Date: Thu, 22 Apr 2010 16:06:59 +0200
Subject: x86, cacheinfo: Turn off L3 cache index disable feature in
 virtualized environments

When running a quest kernel on xen we get:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
IP: [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
PGD 0
Oops: 0000 [#1] SMP
last sysfs file:
CPU 0
Modules linked in:

Pid: 0, comm: swapper Tainted: G        W  2.6.34-rc3 #1 /HVM domU
RIP: 0010:[<ffffffff8142f2fb>]  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x
2ca/0x3df
RSP: 0018:ffff880002203e08  EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000060
RDX: 0000000000000000 RSI: 0000000000000040 RDI: 0000000000000000
RBP: ffff880002203ed8 R08: 00000000000017c0 R09: ffff880002203e38
R10: ffff8800023d5d40 R11: ffffffff81a01e28 R12: ffff880187e6f5c0
R13: ffff880002203e34 R14: ffff880002203e58 R15: ffff880002203e68
FS:  0000000000000000(0000) GS:ffff880002200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000038 CR3: 0000000001a3c000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a44020)
Stack:
 ffffffff810d7ecb ffff880002203e20 ffffffff81059140 ffff880002203e30
<0> ffffffff810d7ec9 0000000002203e40 000000000050d140 ffff880002203e70
<0> 0000000002008140 0000000000000086 ffff880040020140 ffffffff81068b8b
Call Trace:
 <IRQ>
 [<ffffffff810d7ecb>] ? sync_supers_timer_fn+0x0/0x1c
 [<ffffffff81059140>] ? mod_timer+0x23/0x25
 [<ffffffff810d7ec9>] ? arm_supers_timer+0x34/0x36
 [<ffffffff81068b8b>] ? hrtimer_get_next_event+0xa7/0xc3
 [<ffffffff81058e85>] ? get_next_timer_interrupt+0x19a/0x20d
 [<ffffffff8142fa23>] get_cpu_leaves+0x5c/0x232
 [<ffffffff8106a7b1>] ? sched_clock_local+0x1c/0x82
 [<ffffffff8106a9a0>] ? sched_clock_tick+0x75/0x7a
 [<ffffffff8107748c>] generic_smp_call_function_single_interrupt+0xae/0xd0
 [<ffffffff8101f6ef>] smp_call_function_single_interrupt+0x18/0x27
 [<ffffffff8100a773>] call_function_single_interrupt+0x13/0x20
 <EOI>
 [<ffffffff8143c468>] ? notifier_call_chain+0x14/0x63
 [<ffffffff810295c6>] ? native_safe_halt+0xc/0xd
 [<ffffffff810114eb>] ? default_idle+0x36/0x53
 [<ffffffff81008c22>] cpu_idle+0xaa/0xe4
 [<ffffffff81423a9a>] rest_init+0x7e/0x80
 [<ffffffff81b10dd2>] start_kernel+0x40e/0x419
 [<ffffffff81b102c8>] x86_64_start_reservations+0xb3/0xb7
 [<ffffffff81b103c4>] x86_64_start_kernel+0xf8/0x107
Code: 14 d5 40 ff ae 81 8b 14 02 31 c0 3b 15 47 1c 8b 00 7d 0e 48 8b 05 36 1c 8b
 00 48 63 d2 48 8b 04 d0 c7 85 5c ff ff ff 00 00 00 00 <8b> 70 38 48 8d 8d 5c ff
 ff ff 48 8b 78 10 ba c4 01 00 00 e8 eb
RIP  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
 RSP <ffff880002203e08>
CR2: 0000000000000038
---[ end trace a7919e7f17c0a726 ]---

The L3 cache index disable feature of AMD CPUs has to be disabled if the
kernel is running as guest on top of a hypervisor because northbridge
devices are not available to the guest. Currently, this fixes a boot
crash on top of Xen. In the future this will become an issue on KVM as
well.

Check if northbridge devices are present and do not enable the feature
if there are none.

Signed-off-by: Frank Arnold <frank.arnold@amd.com>
LKML-Reference: <1271945222-5283-3-git-send-email-bp@amd64.org>
Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index acfb083..5ab14c8 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -344,6 +344,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	     boot_cpu_data.x86_mask < 0x1)
 			return;
 
+	/* not in virtualized environments */
+	if (num_k8_northbridges == 0)
+		return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
-- 
cgit v1.1


From 9350f982e4fe539e83a2d4a13e9b53ad8253c4a8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:07:00 +0200
Subject: x86, cacheinfo: Reorganize AMD L3 cache structure

Add a struct representing L3 cache attributes (subcache sizes and
indices count) and move the respective members out of _cpuid4_info.
Also, stash the struct pci_dev ptr into the struct simplifying the code
even more.

There should be no functionality change resulting from this patch except
slightly slimming the _cpuid4_info per-cpu vars.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-4-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 53 +++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 5ab14c8..ff663ca 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
 	u32 full;
 };
 
+struct amd_l3_cache {
+	struct	 pci_dev *dev;
+	bool	 can_disable;
+	unsigned indices;
+	u8	 subcaches[4];
+};
+
 struct _cpuid4_info {
 	union _cpuid4_leaf_eax eax;
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
-	bool can_disable;
-	unsigned int l3_indices;
+	struct amd_l3_cache *l3;
 	DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
 };
 
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
-	bool can_disable;
-	unsigned int l3_indices;
+	struct amd_l3_cache *l3;
 };
 
 unsigned short			num_cache_leaves;
@@ -302,7 +307,7 @@ struct _cache_attr {
 };
 
 #ifdef CONFIG_CPU_SUP_AMD
-static unsigned int __cpuinit amd_calc_l3_indices(void)
+static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
 	/*
 	 * We're called over smp_call_function_single() and therefore
@@ -317,12 +322,14 @@ static unsigned int __cpuinit amd_calc_l3_indices(void)
 	pci_read_config_dword(dev, 0x1C4, &val);
 
 	/* calculate subcache sizes */
-	sc0 = !(val & BIT(0));
-	sc1 = !(val & BIT(4));
-	sc2 = !(val & BIT(8))  + !(val & BIT(9));
-	sc3 = !(val & BIT(12)) + !(val & BIT(13));
+	l3->subcaches[0] = sc0 = !(val & BIT(0));
+	l3->subcaches[1] = sc1 = !(val & BIT(4));
+	l3->subcaches[2] = sc2 = !(val & BIT(8))  + !(val & BIT(9));
+	l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+	l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
 
-	return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+	l3->dev = dev;
 }
 
 static void __cpuinit
@@ -348,19 +355,23 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	if (num_k8_northbridges == 0)
 		return;
 
-	this_leaf->can_disable = true;
-	this_leaf->l3_indices  = amd_calc_l3_indices();
+	this_leaf->l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
+	if (!this_leaf->l3) {
+		printk(KERN_WARNING "Error allocating L3 struct\n");
+		return;
+	}
+
+	this_leaf->l3->can_disable = true;
+	amd_calc_l3_indices(this_leaf->l3);
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 				  unsigned int index)
 {
-	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-	int node = amd_get_nb_id(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
+	struct pci_dev *dev = this_leaf->l3->dev;
 	unsigned int reg = 0;
 
-	if (!this_leaf->can_disable)
+	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
 	if (!dev)
@@ -382,15 +393,14 @@ SHOW_CACHE_DISABLE(1)
 static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	const char *buf, size_t count, unsigned int index)
 {
+	struct pci_dev *dev = this_leaf->l3->dev;
 	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-	int node = amd_get_nb_id(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
 	unsigned long val = 0;
 
 #define SUBCACHE_MASK	(3UL << 20)
 #define SUBCACHE_INDEX	0xfff
 
-	if (!this_leaf->can_disable)
+	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -404,7 +414,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 
 	/* do not allow writes outside of allowed bits */
 	if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
-	    ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
+	    ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
 		return -EINVAL;
 
 	val |= BIT(30);
@@ -708,6 +718,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
 	for (i = 0; i < num_cache_leaves; i++)
 		cache_remove_shared_cpu_map(cpu, i);
 
+	kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
 	kfree(per_cpu(ici_cpuid4_info, cpu));
 	per_cpu(ici_cpuid4_info, cpu) = NULL;
 }
@@ -992,7 +1003,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 
 		this_leaf = CPUID4_INFO_IDX(cpu, i);
 
-		if (this_leaf->can_disable)
+		if (this_leaf->l3 && this_leaf->l3->can_disable)
 			ktype_cache.default_attrs = default_l3_attrs;
 		else
 			ktype_cache.default_attrs = default_attrs;
-- 
cgit v1.1


From ba06edb63f5ef2913aad37070eaec3c9d8ac73b8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:07:01 +0200
Subject: x86, cacheinfo: Make L3 cache info per node

Currently, we're allocating L3 cache info and calculating indices for
each online cpu which is clearly superfluous. Instead, we need to do
this per-node as is each L3 cache.

No functional change, only per-cpu memory savings.

-v2: Allocate L3 cache descriptors array dynamically.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-5-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 59 ++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index ff663ca..1346e9c 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -307,19 +307,18 @@ struct _cache_attr {
 };
 
 #ifdef CONFIG_CPU_SUP_AMD
+
+/*
+ * L3 cache descriptors
+ */
+static struct amd_l3_cache **__cpuinitdata l3_caches;
+
 static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
-	/*
-	 * We're called over smp_call_function_single() and therefore
-	 * are on the correct cpu.
-	 */
-	int cpu = smp_processor_id();
-	int node = cpu_to_node(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
 	unsigned int sc0, sc1, sc2, sc3;
 	u32 val = 0;
 
-	pci_read_config_dword(dev, 0x1C4, &val);
+	pci_read_config_dword(l3->dev, 0x1C4, &val);
 
 	/* calculate subcache sizes */
 	l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -328,13 +327,31 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 	l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
 
 	l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+}
+
+static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
+{
+	struct amd_l3_cache *l3;
+	struct pci_dev *dev = node_to_k8_nb_misc(node);
+
+	l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
+	if (!l3) {
+		printk(KERN_WARNING "Error allocating L3 struct\n");
+		return NULL;
+	}
 
 	l3->dev = dev;
+
+	amd_calc_l3_indices(l3);
+
+	return l3;
 }
 
 static void __cpuinit
 amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
+	int node;
+
 	if (boot_cpu_data.x86 != 0x10)
 		return;
 
@@ -355,14 +372,28 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	if (num_k8_northbridges == 0)
 		return;
 
-	this_leaf->l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
-	if (!this_leaf->l3) {
-		printk(KERN_WARNING "Error allocating L3 struct\n");
-		return;
+	/*
+	 * Strictly speaking, the amount in @size below is leaked since it is
+	 * never freed but this is done only on shutdown so it doesn't matter.
+	 */
+	if (!l3_caches) {
+		int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+
+		l3_caches = kzalloc(size, GFP_ATOMIC);
+		if (!l3_caches)
+			return;
 	}
 
-	this_leaf->l3->can_disable = true;
-	amd_calc_l3_indices(this_leaf->l3);
+	node = amd_get_nb_id(smp_processor_id());
+
+	if (!l3_caches[node]) {
+		l3_caches[node] = amd_init_l3_cache(node);
+		l3_caches[node]->can_disable = true;
+	}
+
+	WARN_ON(!l3_caches[node]);
+
+	this_leaf->l3 = l3_caches[node];
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
-- 
cgit v1.1


From 59d3b388741cf1a5eb7ad27fd4e9ed72643164ae Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:07:02 +0200
Subject: x86, cacheinfo: Disable index in all four subcaches

When disabling an L3 cache index, make sure we disable that index in
all four subcaches of the L3. Clarify nomenclature while at it, wrt to
disable slots versus disable index and rename accordingly.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-6-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 60 +++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1346e9c..33eae20 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -397,7 +397,7 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
-				  unsigned int index)
+				  unsigned int slot)
 {
 	struct pci_dev *dev = this_leaf->l3->dev;
 	unsigned int reg = 0;
@@ -408,21 +408,53 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 	if (!dev)
 		return -EINVAL;
 
-	pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+	pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
 	return sprintf(buf, "0x%08x\n", reg);
 }
 
-#define SHOW_CACHE_DISABLE(index)					\
+#define SHOW_CACHE_DISABLE(slot)					\
 static ssize_t								\
-show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf)	\
+show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf)	\
 {									\
-	return show_cache_disable(this_leaf, buf, index);		\
+	return show_cache_disable(this_leaf, buf, slot);		\
 }
 SHOW_CACHE_DISABLE(0)
 SHOW_CACHE_DISABLE(1)
 
+static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
+				 unsigned slot, unsigned long idx)
+{
+	int i;
+
+	idx |= BIT(30);
+
+	/*
+	 *  disable index in all 4 subcaches
+	 */
+	for (i = 0; i < 4; i++) {
+		u32 reg = idx | (i << 20);
+
+		if (!l3->subcaches[i])
+			continue;
+
+		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+
+		/*
+		 * We need to WBINVD on a core on the node containing the L3
+		 * cache which indices we disable therefore a simple wbinvd()
+		 * is not sufficient.
+		 */
+		wbinvd_on_cpu(cpu);
+
+		reg |= BIT(31);
+		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+	}
+}
+
+
 static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
-	const char *buf, size_t count, unsigned int index)
+				   const char *buf, size_t count,
+				   unsigned int slot)
 {
 	struct pci_dev *dev = this_leaf->l3->dev;
 	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -448,23 +480,17 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	    ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
 		return -EINVAL;
 
-	val |= BIT(30);
-	pci_write_config_dword(dev, 0x1BC + index * 4, val);
-	/*
-	 * We need to WBINVD on a core on the node containing the L3 cache which
-	 * indices we disable therefore a simple wbinvd() is not sufficient.
-	 */
-	wbinvd_on_cpu(cpu);
-	pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
+	amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
+
 	return count;
 }
 
-#define STORE_CACHE_DISABLE(index)					\
+#define STORE_CACHE_DISABLE(slot)					\
 static ssize_t								\
-store_cache_disable_##index(struct _cpuid4_info *this_leaf,		\
+store_cache_disable_##slot(struct _cpuid4_info *this_leaf,		\
 			    const char *buf, size_t count)		\
 {									\
-	return store_cache_disable(this_leaf, buf, count, index);	\
+	return store_cache_disable(this_leaf, buf, count, slot);	\
 }
 STORE_CACHE_DISABLE(0)
 STORE_CACHE_DISABLE(1)
-- 
cgit v1.1


From 402af0d7c692ddcfa2333e93d3f275ebd0487926 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 21 Apr 2010 15:21:51 +0100
Subject: x86, asm: Introduce and use percpu_inc()

... generating slightly smaller code.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF261F020000780003B33C@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0af..7a355dd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -539,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	struct mce m;
 	int i;
 
-	__get_cpu_var(mce_poll_count)++;
+	percpu_inc(mce_poll_count);
 
 	mce_setup(&m);
 
@@ -934,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	atomic_inc(&mce_entry);
 
-	__get_cpu_var(mce_exception_count)++;
+	percpu_inc(mce_exception_count);
 
 	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
-- 
cgit v1.1


From 5967ed87ade85a421ef814296c3c7f182b08c225 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 21 Apr 2010 16:08:14 +0100
Subject: x86-64: Reduce SMP locks table size

Reduce the SMP locks table size by using relative pointers instead of
absolute ones, thus cutting the table size by half.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF30FE020000780003B3B6@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/alternative.c | 45 ++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5..9367384 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 }
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
 static void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
@@ -235,37 +235,39 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 
 #ifdef CONFIG_SMP
 
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+				  u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn DS segment override prefix into lock prefix */
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+		text_poke(ptr, ((unsigned char []){0xf0}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
 
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+				    u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	if (noreplace_smp)
 		return;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn lock prefix into DS segment override prefix */
-		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+		text_poke(ptr, ((unsigned char []){0x3E}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
@@ -276,8 +278,8 @@ struct smp_alt_module {
 	char		*name;
 
 	/* ptrs to lock prefixes */
-	u8		**locks;
-	u8		**locks_end;
+	const s32	*locks;
+	const s32	*locks_end;
 
 	/* .text segment, needed to avoid patching init code ;) */
 	u8		*text;
@@ -398,16 +400,19 @@ void alternatives_smp_switch(int smp)
 int alternatives_text_reserved(void *start, void *end)
 {
 	struct smp_alt_module *mod;
-	u8 **ptr;
+	const s32 *poff;
 	u8 *text_start = start;
 	u8 *text_end = end;
 
 	list_for_each_entry(mod, &smp_alt_modules, next) {
 		if (mod->text > text_end || mod->text_end < text_start)
 			continue;
-		for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
-			if (text_start <= *ptr && text_end >= *ptr)
+		for (poff = mod->locks; poff < mod->locks_end; poff++) {
+			const u8 *ptr = (const u8 *)poff + *poff;
+
+			if (text_start <= ptr && text_end > ptr)
 				return 1;
+		}
 	}
 
 	return 0;
-- 
cgit v1.1


From 47f9fe26299ae022ac1e3fa12e7e73def62b7898 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 21 Apr 2010 16:19:57 +0100
Subject: x86-64: Don't export init_level4_pgt

It's not used by any module, and i386 (as well as some other arches)
also doesn't export its equivalent (swapper_pg_dir).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF33BD020000780003B3E4@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/x8664_ksyms_64.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 693920b..1b950d1 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
 
 EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(init_level4_pgt);
 #ifndef CONFIG_PARAVIRT
 EXPORT_SYMBOL(native_load_gs_index);
 #endif
-- 
cgit v1.1


From 40d2e76315da38993129090dc5d56377e573c312 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:43 -0400
Subject: x86-32: Rework cache flush denied handler

The cache flush denied error is an erratum on some AMD 486 clones.  If an invd
instruction is executed in userspace, the processor calls exception 19 (13 hex)
instead of #GP (13 decimal).  On cpus where XMM is not supported, redirect
exception 19 to do_general_protection().  Also, remove die_if_kernel(), since
this was the last user.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-2-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/entry_32.S | 19 +++++++++++++++++++
 arch/x86/kernel/traps.c    | 31 +++----------------------------
 2 files changed, 22 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 44a8e0d..cd49141 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -53,6 +53,7 @@
 #include <asm/processor-flags.h>
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpufeature.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -905,7 +906,25 @@ ENTRY(simd_coprocessor_error)
 	RING0_INT_FRAME
 	pushl $0
 	CFI_ADJUST_CFA_OFFSET 4
+#ifdef CONFIG_X86_INVD_BUG
+	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
+661:	pushl $do_general_protection
+662:
+.section .altinstructions,"a"
+	.balign 4
+	.long 661b
+	.long 663f
+	.byte X86_FEATURE_XMM
+	.byte 662b-661b
+	.byte 664f-663f
+.previous
+.section .altinstr_replacement,"ax"
+663:	pushl $do_simd_coprocessor_error
+664:
+.previous
+#else
 	pushl $do_simd_coprocessor_error
+#endif
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e44..a16c9df 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,15 +108,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	dec_preempt_count();
 }
 
-#ifdef CONFIG_X86_32
-static inline void
-die_if_kernel(const char *str, struct pt_regs *regs, long err)
-{
-	if (!user_mode_vm(regs))
-		die(str, regs, err);
-}
-#endif
-
 static void __kprobes
 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	long error_code, siginfo_t *info)
@@ -729,30 +720,14 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 	conditional_sti(regs);
 
 #ifdef CONFIG_X86_32
-	if (cpu_has_xmm) {
-		/* Handle SIMD FPU exceptions on PIII+ processors. */
-		ignore_fpu_irq = 1;
-		simd_math_error((void __user *)regs->ip);
-		return;
-	}
-	/*
-	 * Handle strange cache flush from user space exception
-	 * in all other cases.  This is undocumented behaviour.
-	 */
-	if (regs->flags & X86_VM_MASK) {
-		handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
-		return;
-	}
-	current->thread.trap_no = 19;
-	current->thread.error_code = error_code;
-	die_if_kernel("cache flush denied", regs, error_code);
-	force_sig(SIGSEGV, current);
+	ignore_fpu_irq = 1;
 #else
 	if (!user_mode(regs) &&
 			kernel_math_error(regs, "kernel simd math error", 19))
 		return;
-	simd_math_error((void __user *)regs->ip);
 #endif
+
+	simd_math_error((void __user *)regs->ip);
 }
 
 dotraplinkage void
-- 
cgit v1.1


From 9b6dba9e0798325dab427b9d60c61630ccc39b28 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:44 -0400
Subject: x86: Merge simd_math_error() into math_error()

The only difference between FPU and SIMD exceptions is where the
status bits are read from (cwd/swd vs. mxcsr).  This also fixes
the discrepency introduced by commit adf77bac, which fixed FPU
but not SIMD.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-3-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/irqinit.c |   2 +-
 arch/x86/kernel/traps.c   | 100 +++++++++++++++-------------------------------
 2 files changed, 33 insertions(+), 69 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 0ed2d30..990ae7c 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
 	outb(0, 0xF0);
 	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
 		return IRQ_NONE;
-	math_error((void __user *)get_irq_regs()->ip);
+	math_error(get_irq_regs(), 0, 16);
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a16c9df..a472992 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -595,36 +595,48 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
  * the correct behaviour even in the presence of the asynchronous
  * IRQ13 behaviour
  */
-void math_error(void __user *ip)
+void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
 	struct task_struct *task;
 	siginfo_t info;
-	unsigned short cwd, swd, err;
+	unsigned short err;
 
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
 	task = current;
 	save_init_fpu(task);
-	task->thread.trap_no = 16;
-	task->thread.error_code = 0;
+	task->thread.trap_no = trapnr;
+	task->thread.error_code = error_code;
 	info.si_signo = SIGFPE;
 	info.si_errno = 0;
-	info.si_addr = ip;
-	/*
-	 * (~cwd & swd) will mask out exceptions that are not set to unmasked
-	 * status.  0x3f is the exception bits in these regs, 0x200 is the
-	 * C1 reg you need in case of a stack fault, 0x040 is the stack
-	 * fault bit.  We should only be taking one exception at a time,
-	 * so if this combination doesn't produce any single exception,
-	 * then we have a bad program that isn't synchronizing its FPU usage
-	 * and it will suffer the consequences since we won't be able to
-	 * fully reproduce the context of the exception
-	 */
-	cwd = get_fpu_cwd(task);
-	swd = get_fpu_swd(task);
+	info.si_addr = (void __user *)regs->ip;
+	if (trapnr == 16) {
+		unsigned short cwd, swd;
+		/*
+		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
+		 * status.  0x3f is the exception bits in these regs, 0x200 is the
+		 * C1 reg you need in case of a stack fault, 0x040 is the stack
+		 * fault bit.  We should only be taking one exception at a time,
+		 * so if this combination doesn't produce any single exception,
+		 * then we have a bad program that isn't synchronizing its FPU usage
+		 * and it will suffer the consequences since we won't be able to
+		 * fully reproduce the context of the exception
+		 */
+		cwd = get_fpu_cwd(task);
+		swd = get_fpu_swd(task);
 
-	err = swd & ~cwd;
+		err = swd & ~cwd;
+	} else {
+		/*
+		 * The SIMD FPU exceptions are handled a little differently, as there
+		 * is only a single status/control register.  Thus, to determine which
+		 * unmasked exception was caught we must mask the exception mask bits
+		 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+		 */
+		unsigned short mxcsr = get_fpu_mxcsr(task);
+		err = ~(mxcsr >> 7) & mxcsr;
+	}
 
 	if (err & 0x001) {	/* Invalid op */
 		/*
@@ -663,55 +675,7 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
-	math_error((void __user *)regs->ip);
-}
-
-static void simd_math_error(void __user *ip)
-{
-	struct task_struct *task;
-	siginfo_t info;
-	unsigned short mxcsr;
-
-	/*
-	 * Save the info for the exception handler and clear the error.
-	 */
-	task = current;
-	save_init_fpu(task);
-	task->thread.trap_no = 19;
-	task->thread.error_code = 0;
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_code = __SI_FAULT;
-	info.si_addr = ip;
-	/*
-	 * The SIMD FPU exceptions are handled a little differently, as there
-	 * is only a single status/control register.  Thus, to determine which
-	 * unmasked exception was caught we must mask the exception mask bits
-	 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
-	 */
-	mxcsr = get_fpu_mxcsr(task);
-	switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
-	case 0x000:
-	default:
-		break;
-	case 0x001: /* Invalid Op */
-		info.si_code = FPE_FLTINV;
-		break;
-	case 0x002: /* Denormalize */
-	case 0x010: /* Underflow */
-		info.si_code = FPE_FLTUND;
-		break;
-	case 0x004: /* Zero Divide */
-		info.si_code = FPE_FLTDIV;
-		break;
-	case 0x008: /* Overflow */
-		info.si_code = FPE_FLTOVF;
-		break;
-	case 0x020: /* Precision */
-		info.si_code = FPE_FLTRES;
-		break;
-	}
-	force_sig_info(SIGFPE, &info, task);
+	math_error(regs, error_code, 16);
 }
 
 dotraplinkage void
@@ -727,7 +691,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
-	simd_math_error((void __user *)regs->ip);
+	math_error(regs, error_code, 19);
 }
 
 dotraplinkage void
-- 
cgit v1.1


From e2e75c915de045f0785387dc32f55e92fab0614c Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:45 -0400
Subject: x86: Merge kernel_math_error() into math_error()

Clean up the kernel exception handling and make it more similar to
the other traps.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-4-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/traps.c | 44 ++++++++++++++++----------------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a472992..53ba86f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -576,20 +576,6 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	return;
 }
 
-#ifdef CONFIG_X86_64
-static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
-{
-	if (fixup_exception(regs))
-		return 1;
-
-	notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
-	/* Illegal floating point operation in the kernel */
-	current->thread.trap_no = trapnr;
-	die(str, regs, 0);
-	return 0;
-}
-#endif
-
 /*
  * Note that we play around with the 'TS' bit in an attempt to get
  * the correct behaviour even in the presence of the asynchronous
@@ -597,14 +583,28 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
  */
 void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
-	struct task_struct *task;
+	struct task_struct *task = current;
 	siginfo_t info;
 	unsigned short err;
+	char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
+
+	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
+		return;
+	conditional_sti(regs);
+
+	if (!user_mode_vm(regs))
+	{
+		if (!fixup_exception(regs)) {
+			task->thread.error_code = error_code;
+			task->thread.trap_no = trapnr;
+			die(str, regs, error_code);
+		}
+		return;
+	}
 
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
-	task = current;
 	save_init_fpu(task);
 	task->thread.trap_no = trapnr;
 	task->thread.error_code = error_code;
@@ -665,14 +665,8 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	conditional_sti(regs);
-
 #ifdef CONFIG_X86_32
 	ignore_fpu_irq = 1;
-#else
-	if (!user_mode(regs) &&
-	    kernel_math_error(regs, "kernel x87 math error", 16))
-		return;
 #endif
 
 	math_error(regs, error_code, 16);
@@ -681,14 +675,8 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	conditional_sti(regs);
-
 #ifdef CONFIG_X86_32
 	ignore_fpu_irq = 1;
-#else
-	if (!user_mode(regs) &&
-			kernel_math_error(regs, "kernel simd math error", 19))
-		return;
 #endif
 
 	math_error(regs, error_code, 19);
-- 
cgit v1.1


From 250825008f1f94887bc039e9227a8adfb5ba366e Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:46 -0400
Subject: x86-32: Don't set ignore_fpu_irq in simd exception

Any processor that supports simd will have an internal fpu, and the
irq13 handler will not be enabled.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-5-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/traps.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 53ba86f..00516e1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -675,10 +675,6 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-#ifdef CONFIG_X86_32
-	ignore_fpu_irq = 1;
-#endif
-
 	math_error(regs, error_code, 19);
 }
 
-- 
cgit v1.1


From 097c1bd5673edaf2a162724636858b71f658fdd2 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 3 May 2010 15:49:31 -0700
Subject: x86, cpu: Make APERF/MPERF a normal table-driven flag

APERF/MPERF can be handled via the table like all the other scattered
CPU flags.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-4-git-send-email-bp@amd64.org>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index fd1fc19..10fa568 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,13 +30,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	const struct cpuid_bit *cb;
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_IDA,   CR_EAX, 1, 0x00000006 },
-		{ X86_FEATURE_ARAT,  CR_EAX, 2, 0x00000006 },
-		{ X86_FEATURE_CPB,   CR_EDX, 9, 0x80000007 },
-		{ X86_FEATURE_NPT,   CR_EDX, 0, 0x8000000a },
-		{ X86_FEATURE_LBRV,  CR_EDX, 1, 0x8000000a },
-		{ X86_FEATURE_SVML,  CR_EDX, 2, 0x8000000a },
-		{ X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
+		{ X86_FEATURE_IDA,   		CR_EAX, 1, 0x00000006 },
+		{ X86_FEATURE_ARAT,  		CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006 },
+		{ X86_FEATURE_CPB,   		CR_EDX, 9, 0x80000007 },
+		{ X86_FEATURE_NPT,   		CR_EDX, 0, 0x8000000a },
+		{ X86_FEATURE_LBRV,  		CR_EDX, 1, 0x8000000a },
+		{ X86_FEATURE_SVML,  		CR_EDX, 2, 0x8000000a },
+		{ X86_FEATURE_NRIPS, 		CR_EDX, 3, 0x8000000a },
 		{ 0, 0, 0, 0 }
 	};
 
@@ -54,14 +55,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		if (regs[cb->reg] & (1 << cb->bit))
 			set_cpu_cap(c, cb->feature);
 	}
-
-	/*
-	 * common AMD/Intel features
-	 */
-	if (c->cpuid_level >= 6) {
-		if (cpuid_ecx(6) & 0x1)
-			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
-	}
 }
 
 /* leaf 0xb SMT level */
-- 
cgit v1.1


From 2c2df8418ac7908eec4558407b83f16739006c54 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:02 -0700
Subject: x86, acpi/irq: Introduce apci_isa_irq_to_gsi

There are a number of cases where the current code makes the assumption
that isa irqs identity map to the first 16 acpi global system intereupts.
In most instances that assumption is correct as that is the required
behaviour in dual i8259 mode and the default behavior in ioapic mode.

However there are some systems out there that take advantage of acpis
interrupt remapping  for the isa irqs to have a completely different
mapping of isa_irq to gsi.

Introduce acpi_isa_irq_to_gsi to perform this mapping explicitly in the
code that needs it.  Initially this will be just the current assumed
identity mapping to ensure it's introduction does not cause regressions.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-1-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba..da718d6 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -458,6 +458,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 	return 0;
 }
 
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+	if (isa_irq >= 16)
+		return -1;
+	*gsi = isa_irq;
+	return 0;
+}
+
 /*
  * success: return IRQ number (>=0)
  * failure: return < 0
-- 
cgit v1.1


From 9a0a91bb56d2915cdb8585717de38376ad20fef9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:03 -0700
Subject: x86, acpi/irq: Teach acpi_get_override_irq to take a gsi not an
 isa_irq

In perverse acpi implementations the isa irqs are not identity mapped
to the first 16 gsi.  Furthermore at least the extended interrupt
resource capability may return gsi's and not isa irqs.  So since
what we get from acpi is a gsi teach acpi_get_overrride_irq to
operate on a gsi instead of an isa_irq.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-2-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 127b871..73ec928 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4082,22 +4082,27 @@ int __init io_apic_get_version(int ioapic)
 	return reg_01.bits.version;
 }
 
-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
 {
-	int i;
+	int ioapic, pin, idx;
 
 	if (skip_ioapic_setup)
 		return -1;
 
-	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].irqtype == mp_INT &&
-		    mp_irqs[i].srcbusirq == bus_irq)
-			break;
-	if (i >= mp_irq_entries)
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return -1;
+
+	pin = mp_find_ioapic_pin(ioapic, gsi);
+	if (pin < 0)
+		return -1;
+
+	idx = find_irq_entry(ioapic, pin, mp_INT);
+	if (idx < 0)
 		return -1;
 
-	*trigger = irq_trigger(i);
-	*polarity = irq_polarity(i);
+	*trigger = irq_trigger(idx);
+	*polarity = irq_polarity(idx);
 	return 0;
 }
 
-- 
cgit v1.1


From 9d2062b879495649bb525cf7979126da2e45d288 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:05 -0700
Subject: x86, acpi/irq: Fix acpi_sci_ioapic_setup so it has both bus_irq and
 gsi

Currently acpi_sci_ioapic_setup calls mp_override_legacy_irq with
bus_irq == gsi, which is wrong if we are comming from an override
Instead pass the bus_irq into acpi_sci_ioapic_setup.

This fix was inspired by a similar fix from:
Yinghai Lu <yinghai@kernel.org>

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-4-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index da718d6..0a036dc 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -313,7 +313,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 /*
  * Parse Interrupt Source Override for the ACPI SCI
  */
-static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
 {
 	if (trigger == 0)	/* compatible SCI trigger is level */
 		trigger = 3;
@@ -333,7 +333,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
 	 * If GSI is < 16, this will update its flags,
 	 * else it will create a new mp_irqs[] entry.
 	 */
-	mp_override_legacy_irq(gsi, polarity, trigger, gsi);
+	mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
 
 	/*
 	 * stash over-ride to indicate we've been here
@@ -357,9 +357,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 	acpi_table_print_madt_entry(header);
 
 	if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
-		acpi_sci_ioapic_setup(intsrc->global_irq,
+		acpi_sci_ioapic_setup(intsrc->source_irq,
 				      intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
-				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
+				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+				      intsrc->global_irq);
 		return 0;
 	}
 
@@ -1162,7 +1163,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	 * pretend we got one so we can set the SCI flags.
 	 */
 	if (!acpi_sci_override_gsi)
-		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
+		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+				      acpi_gbl_FADT.sci_interrupt);
 
 	/* Fill in identity legacy mappings where no override */
 	mp_config_acpi_legacy_irqs();
-- 
cgit v1.1


From 0fd52670fb6400be0996ac492b5ed77f3d83d69a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:06 -0700
Subject: x86, acpi/irq: Generalize mp_config_acpi_legacy_irqs

Remove the assumption that there is not an override for isa irq 0.
Instead lookup the gsi and from that lookup the ioapic and pin of each
isa irq indivdually.

In general this should not have any behavioural affect but in
perverse cases this gets all of the details correct, instead of
doing something weird.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-5-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0a036dc..3ee92f2 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -961,8 +961,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 void __init mp_config_acpi_legacy_irqs(void)
 {
 	int i;
-	int ioapic;
-	unsigned int dstapic;
 	struct mpc_intsrc mp_irq;
 
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
@@ -983,19 +981,27 @@ void __init mp_config_acpi_legacy_irqs(void)
 #endif
 
 	/*
-	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
-	 */
-	ioapic = mp_find_ioapic(0);
-	if (ioapic < 0)
-		return;
-	dstapic = mp_ioapics[ioapic].apicid;
-
-	/*
 	 * Use the default configuration for the IRQs 0-15.  Unless
 	 * overridden by (MADT) interrupt source override entries.
 	 */
 	for (i = 0; i < 16; i++) {
+		int ioapic, pin;
+		unsigned int dstapic;
 		int idx;
+		u32 gsi;
+
+		/* Locate the gsi that irq i maps to. */
+		if (acpi_isa_irq_to_gsi(i, &gsi))
+			continue;
+
+		/*
+		 * Locate the IOAPIC that manages the ISA IRQ.
+		 */
+		ioapic = mp_find_ioapic(gsi);
+		if (ioapic < 0)
+			continue;
+		pin = mp_find_ioapic_pin(ioapic, gsi);
+		dstapic = mp_ioapics[ioapic].apicid;
 
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1005,7 +1011,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 				break;
 
 			/* Do we already have a mapping for this IOAPIC pin */
-			if (irq->dstapic == dstapic && irq->dstirq == i)
+			if (irq->dstapic == dstapic && irq->dstirq == pin)
 				break;
 		}
 
@@ -1020,7 +1026,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 		mp_irq.dstapic = dstapic;
 		mp_irq.irqtype = mp_INT;
 		mp_irq.srcbusirq = i; /* Identity mapped */
-		mp_irq.dstirq = i;
+		mp_irq.dstirq = pin;
 
 		save_mp_irq(&mp_irq);
 	}
-- 
cgit v1.1


From 4b6b19a1c7302477653d799a53d48063dd53d555 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:08 -0700
Subject: x86, ioapic: Fix io_apic_redir_entries to return the number of
 entries.

io_apic_redir_entries has a huge conceptual bug.  It returns the maximum
redirection entry not the number of redirection entries.  Which simply
does not match what the name of the function.  This just caught me
and it caught  Feng Tang, and  Len Brown when they wrote sfi_parse_ioapic.

Modify io_apic_redir_entries to actually return the number of redirection
entries, and fix the callers so that they properly handle receiving the
number of the number of redirection table entries, instead of the
number of redirection table entries less one.

While the usage in sfi.c does not show up in this patch it is fixed
by virtue of the fact that io_apic_redir_entries now has the semantics
sfi_parse_ioapic most reasonably expects.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-7-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 73ec928..0a053e6 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3855,7 +3855,11 @@ int __init io_apic_get_redir_entries (int ioapic)
 	reg_01.raw = io_apic_read(ioapic, 1);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	return reg_01.bits.entries;
+	/* The register returns the maximum index redir index
+	 * supported, which is one less than the total number of redir
+	 * entries.
+	 */
+	return reg_01.bits.entries + 1;
 }
 
 void __init probe_nr_irqs_gsi(void)
@@ -3871,7 +3875,7 @@ void __init probe_nr_irqs_gsi(void)
 
 		nr = 0;
 		for (idx = 0; idx < nr_ioapics; idx++)
-			nr += io_apic_get_redir_entries(idx) + 1;
+			nr += io_apic_get_redir_entries(idx);
 
 		if (nr > nr_irqs_gsi)
 			nr_irqs_gsi = nr;
@@ -4306,7 +4310,7 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	 */
 	mp_gsi_routing[idx].gsi_base = gsi_base;
 	mp_gsi_routing[idx].gsi_end = gsi_base +
-	    io_apic_get_redir_entries(idx);
+	    io_apic_get_redir_entries(idx) - 1;
 
 	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
 	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
-- 
cgit v1.1


From eddb0c55a14074d6bac8c2ef169aefd7e2c6f139 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:09 -0700
Subject: x86, ioapic: Fix the types of gsi values

This patches fixes the types of gsi_base and gsi_end values in
struct mp_ioapic_gsi, and the gsi parameter of mp_find_ioapic
and mp_find_ioapic_pin

A gsi is cannonically a u32, not an int.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-8-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0a053e6..9ab9762 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4247,7 +4247,7 @@ void __init ioapic_insert_resources(void)
 	}
 }
 
-int mp_find_ioapic(int gsi)
+int mp_find_ioapic(u32 gsi)
 {
 	int i = 0;
 
@@ -4262,7 +4262,7 @@ int mp_find_ioapic(int gsi)
 	return -1;
 }
 
-int mp_find_ioapic_pin(int ioapic, int gsi)
+int mp_find_ioapic_pin(int ioapic, u32 gsi)
 {
 	if (WARN_ON(ioapic == -1))
 		return -1;
-- 
cgit v1.1


From 5777372af5c929b8f3c706ed7b295b7279537c88 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:10 -0700
Subject: x86, ioapic: Teach mp_register_ioapic to compute a global gsi_end

Add the global variable gsi_end and teach mp_register_ioapic
to keep it uptodate as we add more ioapics into the system.

ioapics can only be added early in boot so the code that
runs later can treat gsi_end as a constant.

Remove the have hacks in sfi.c to second guess mp_register_ioapic
by keeping t's own running total of how many gsi's have been seen,
and instead use the gsi_end.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-9-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 6 ++++++
 arch/x86/kernel/sfi.c          | 4 +---
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9ab9762..f807255 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,6 +89,9 @@ int nr_ioapics;
 /* IO APIC gsi routing info */
 struct mp_ioapic_gsi  mp_gsi_routing[MAX_IO_APICS];
 
+/* The last gsi number used */
+u32 gsi_end;
+
 /* MP IRQ source entries */
 struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
@@ -4312,6 +4315,9 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	mp_gsi_routing[idx].gsi_end = gsi_base +
 	    io_apic_get_redir_entries(idx) - 1;
 
+	if (mp_gsi_routing[idx].gsi_end > gsi_end)
+		gsi_end = mp_gsi_routing[idx].gsi_end;
+
 	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
 	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
 	       mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index 34e0993..7ded578 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
-static u32 gsi_base;
 
 static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 {
@@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 	pentry = (struct sfi_apic_table_entry *)sb->pentry;
 
 	for (i = 0; i < num; i++) {
-		mp_register_ioapic(i, pentry->phys_addr, gsi_base);
-		gsi_base += io_apic_get_redir_entries(i);
+		mp_register_ioapic(i, pentry->phys_addr, gsi_end + 1);
 		pentry++;
 	}
 
-- 
cgit v1.1


From cf7500c0ea133d66f8449d86392d83f840102632 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:11 -0700
Subject: x86, ioapic: In mpparse use mp_register_ioapic

Long ago MP_ioapic_info was the primary way of setting up our
ioapic data structures and mp_register_ioapic was a compatibility
shim for acpi code.  Now the situation is reversed and
and mp_register_ioapic is the primary way of setting up our
ioapic data structures.

Keep the setting up of ioapic data structures uniform by
having mp_register_ioapic call mp_register_ioapic.

This changes a few fields:

- type: is now hardset to MP_IOAPIC but type had to
  bey MP_IOAPIC or MP_ioapic_info would not have been called.

- flags: is now hard coded to MPC_APIC_USABLE.
  We require flags to contain at least MPC_APIC_USEBLE in
  MP_ioapic_info and we don't ever examine flags so dropping
  a few flags that might possibly exist that we have never
  used is harmless.

- apicaddr: Unchanged

- apicver: Read from the ioapic instead of using the cached
  hardware value in the MP table.  The real hardware value
  will be more accurate.

- apicid: Now verified to be unique and changed if it is not.
  If the BIOS got this right this is a noop.  If the BIOS did
  not fixing things appears to be the better solution.

This adds gsi_base and gsi_end values to our ioapics defined with
the mpatable, which will make our lives simpler later since
we can always assume gsi_base and gsi_end are valid.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-10-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/mpparse.c | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e81030f..5ae5d24 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
 		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 }
 
-static int bad_ioapic(unsigned long address)
-{
-	if (nr_ioapics >= MAX_IO_APICS) {
-		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-		       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
-		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-	}
-	if (!address) {
-		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-		       " found in table, skipping!\n");
-		return 1;
-	}
-	return 0;
-}
-
 static void __init MP_ioapic_info(struct mpc_ioapic *m)
 {
 	if (!(m->flags & MPC_APIC_USABLE))
@@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
 	printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
 	       m->apicid, m->apicver, m->apicaddr);
 
-	if (bad_ioapic(m->apicaddr))
-		return;
-
-	mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
-	mp_ioapics[nr_ioapics].apicid = m->apicid;
-	mp_ioapics[nr_ioapics].type = m->type;
-	mp_ioapics[nr_ioapics].apicver = m->apicver;
-	mp_ioapics[nr_ioapics].flags = m->flags;
-	nr_ioapics++;
+	mp_register_ioapic(m->apicid, m->apicaddr, gsi_end + 1);
 }
 
 static void print_MP_intsrc_info(struct mpc_intsrc *m)
-- 
cgit v1.1


From 7716a5c4ff5f1f3dc5e9edcab125cbf7fceef0af Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:12 -0700
Subject: x86, ioapic: Move nr_ioapic_registers calculation to
 mp_register_ioapic.

Now that all ioapic registration happens in mp_register_ioapic we can
move the calculation of nr_ioapic_registers there from enable_IO_APIC.
The number of ioapic registers is already calucated in mp_register_ioapic
so all that really needs to be done is to save the caluclated value
in nr_ioapic_registers.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-11-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index f807255..dae9240 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1953,20 +1953,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
 void __init enable_IO_APIC(void)
 {
-	union IO_APIC_reg_01 reg_01;
 	int i8259_apic, i8259_pin;
 	int apic;
-	unsigned long flags;
-
-	/*
-	 * The number of IO-APIC IRQ registers (== #pins):
-	 */
-	for (apic = 0; apic < nr_ioapics; apic++) {
-		raw_spin_lock_irqsave(&ioapic_lock, flags);
-		reg_01.raw = io_apic_read(apic, 1);
-		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
-	}
 
 	if (!legacy_pic->nr_legacy_irqs)
 		return;
@@ -4293,6 +4281,7 @@ static int bad_ioapic(unsigned long address)
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 {
 	int idx = 0;
+	int entries;
 
 	if (bad_ioapic(address))
 		return;
@@ -4311,9 +4300,14 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
 	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
 	 */
+	entries = io_apic_get_redir_entries(idx);
 	mp_gsi_routing[idx].gsi_base = gsi_base;
-	mp_gsi_routing[idx].gsi_end = gsi_base +
-	    io_apic_get_redir_entries(idx) - 1;
+	mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
+
+	/*
+	 * The number of IO-APIC IRQ registers (== #pins):
+	 */
+	nr_ioapic_registers[idx] = entries;
 
 	if (mp_gsi_routing[idx].gsi_end > gsi_end)
 		gsi_end = mp_gsi_routing[idx].gsi_end;
-- 
cgit v1.1


From d464207c4fdd70c2a0febd4f9c58206fa915bb36 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:13 -0700
Subject: x86, ioapic: Optimize pin_2_irq

Now that all ioapics have valid gsi_base values use this to
accellerate pin_2_irq.  In the case of acpi this also ensures
that pin_2_irq will compute the same irq value for an ioapic
pin as acpi will.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-12-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dae9240..0d35f46 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1019,7 +1019,7 @@ static inline int irq_trigger(int idx)
 int (*ioapic_renumber_irq)(int ioapic, int irq);
 static int pin_2_irq(int idx, int apic, int pin)
 {
-	int irq, i;
+	int irq;
 	int bus = mp_irqs[idx].srcbus;
 
 	/*
@@ -1031,18 +1031,13 @@ static int pin_2_irq(int idx, int apic, int pin)
 	if (test_bit(bus, mp_bus_not_pci)) {
 		irq = mp_irqs[idx].srcbusirq;
 	} else {
-		/*
-		 * PCI IRQs are mapped in order
-		 */
-		i = irq = 0;
-		while (i < apic)
-			irq += nr_ioapic_registers[i++];
-		irq += pin;
+		u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
 		/*
                  * For MPS mode, so far only needed by ES7000 platform
                  */
 		if (ioapic_renumber_irq)
-			irq = ioapic_renumber_irq(apic, irq);
+			gsi = ioapic_renumber_irq(apic, gsi);
+		irq = gsi;
 	}
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.1


From 4afc51a835d3aeba11c35090f524e05c84586d27 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:14 -0700
Subject: x86, ioapic: Simplify probe_nr_irqs_gsi.

Use the global gsi_end value now that all ioapics have
valid gsi numbers instead of a combination of acpi_probe_gsi
and walking all of the ioapics and couting their number of
entries by hand if acpi_probe_gsi gave us an answer we did
not like.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-13-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c    | 23 -----------------------
 arch/x86/kernel/apic/io_apic.c | 17 +++--------------
 2 files changed, 3 insertions(+), 37 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3ee92f2..07a63ce 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -876,29 +876,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
 extern int es7000_plat;
 #endif
 
-int __init acpi_probe_gsi(void)
-{
-	int idx;
-	int gsi;
-	int max_gsi = 0;
-
-	if (acpi_disabled)
-		return 0;
-
-	if (!acpi_ioapic)
-		return 0;
-
-	max_gsi = 0;
-	for (idx = 0; idx < nr_ioapics; idx++) {
-		gsi = mp_gsi_routing[idx].gsi_end;
-
-		if (gsi > max_gsi)
-			max_gsi = gsi;
-	}
-
-	return max_gsi + 1;
-}
-
 static void assign_to_mp_irq(struct mpc_intsrc *m,
 				    struct mpc_intsrc *mp_irq)
 {
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0d35f46..9f3f6ca 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3850,22 +3850,11 @@ int __init io_apic_get_redir_entries (int ioapic)
 
 void __init probe_nr_irqs_gsi(void)
 {
-	int nr = 0;
+	int nr;
 
-	nr = acpi_probe_gsi();
-	if (nr > nr_irqs_gsi) {
+	nr = gsi_end + 1;
+	if (nr > nr_irqs_gsi)
 		nr_irqs_gsi = nr;
-	} else {
-		/* for acpi=off or acpi is not compiled in */
-		int idx;
-
-		nr = 0;
-		for (idx = 0; idx < nr_ioapics; idx++)
-			nr += io_apic_get_redir_entries(idx);
-
-		if (nr > nr_irqs_gsi)
-			nr_irqs_gsi = nr;
-	}
 
 	printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
 }
-- 
cgit v1.1


From 988856ee1623bd37e384105f7bb2b7fe44c009f6 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:15 -0700
Subject: x86, acpi/irq: Handle isa irqs that are not identity mapped to gsi's.

ACPI irq source overrides are allowed for the 16 isa irqs and are
allowed to map any gsi to any isa irq.  A few motherboards have been
seen to take advantage of this and put the isa irqs on the 2nd or
3rd ioapic.  This causes some problems, most notably the fact
that we can not use any gsi < 16.

To correct this move the gsis that are not isa irqs and have
a gsi number < 16 into the linux irq space just past gsi_end.
This is what the es7000 platform is doing today.  Moving only the
low 16 gsis above the rest of the gsi's only penalizes weird
platforms, leaving sane acpi implementations with a 1-1 mapping
of gsis and irqs.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-14-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c    | 57 +++++++++++++++++++++++++++++++++++++++---
 arch/x86/kernel/apic/io_apic.c |  8 ++++--
 2 files changed, 59 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 07a63ce..325fbba 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -94,6 +94,53 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
 
 
 /*
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
+ */
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+static unsigned int gsi_to_irq(unsigned int gsi)
+{
+	unsigned int irq = gsi + NR_IRQS_LEGACY;
+	unsigned int i;
+
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
+		if (isa_irq_to_gsi[i] == gsi) {
+			return i;
+		}
+	}
+
+	/* Provide an identity mapping of gsi == irq
+	 * except on truly weird platforms that have
+	 * non isa irqs in the first 16 gsis.
+	 */
+	if (gsi >= NR_IRQS_LEGACY)
+		irq = gsi;
+	else
+		irq = gsi_end + 1 + gsi;
+
+	return irq;
+}
+
+static u32 irq_to_gsi(int irq)
+{
+	unsigned int gsi;
+
+	if (irq < NR_IRQS_LEGACY)
+		gsi = isa_irq_to_gsi[irq];
+	else if (irq <= gsi_end)
+		gsi = irq;
+	else if (irq <= (gsi_end + NR_IRQS_LEGACY))
+		gsi = irq - gsi_end;
+	else
+		gsi = 0xffffffff;
+
+	return gsi;
+}
+
+/*
  * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
  * to map the target physical address. The problem is that set_fixmap()
  * provides a single page, and it is possible that the page is not
@@ -449,7 +496,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
 
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
-	*irq = gsi;
+	*irq = gsi_to_irq(gsi);
 
 #ifdef CONFIG_X86_IO_APIC
 	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
@@ -463,7 +510,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
 {
 	if (isa_irq >= 16)
 		return -1;
-	*gsi = isa_irq;
+	*gsi = irq_to_gsi(isa_irq);
 	return 0;
 }
 
@@ -491,7 +538,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 		plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
 	}
 #endif
-	irq = plat_gsi;
+	irq = gsi_to_irq(plat_gsi);
 
 	return irq;
 }
@@ -933,6 +980,8 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	mp_irq.dstirq = pin;	/* INTIN# */
 
 	save_mp_irq(&mp_irq);
+
+	isa_irq_to_gsi[bus_irq] = gsi;
 }
 
 void __init mp_config_acpi_legacy_irqs(void)
@@ -1086,7 +1135,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
 			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
 			     polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-	io_apic_set_pci_routing(dev, gsi, &irq_attr);
+	io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
 
 	return gsi;
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9f3f6ca..594827c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1037,7 +1037,11 @@ static int pin_2_irq(int idx, int apic, int pin)
                  */
 		if (ioapic_renumber_irq)
 			gsi = ioapic_renumber_irq(apic, gsi);
-		irq = gsi;
+
+		if (gsi >= NR_IRQS_LEGACY)
+			irq = gsi;
+		else
+			irq = gsi_end + 1 + gsi;
 	}
 
 #ifdef CONFIG_X86_32
@@ -3852,7 +3856,7 @@ void __init probe_nr_irqs_gsi(void)
 {
 	int nr;
 
-	nr = gsi_end + 1;
+	nr = gsi_end + 1 + NR_IRQS_LEGACY;
 	if (nr > nr_irqs_gsi)
 		nr_irqs_gsi = nr;
 
-- 
cgit v1.1


From 7b20bd5fb902088579af4e70f7f802b93181a628 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:16 -0700
Subject: x86, irq: Kill io_apic_renumber_irq

Now that the generic irq layer is performing the exact same remapping as
io_apic_renumber_irq we can kill this weird  es7000 specific function.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-15-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c      |  5 -----
 arch/x86/kernel/apic/es7000_32.c | 19 -------------------
 arch/x86/kernel/apic/io_apic.c   |  6 ------
 3 files changed, 30 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 325fbba..9a5ed58 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1117,11 +1117,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 
 	ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
 
-#ifdef CONFIG_X86_32
-	if (ioapic_renumber_irq)
-		gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
-
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
 		       "%d-%d\n", mp_ioapics[ioapic].apicid,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 03ba1b8..425e53a 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -131,24 +131,6 @@ int					es7000_plat;
 
 static unsigned int			base;
 
-static int
-es7000_rename_gsi(int ioapic, int gsi)
-{
-	if (es7000_plat == ES7000_ZORRO)
-		return gsi;
-
-	if (!base) {
-		int i;
-		for (i = 0; i < nr_ioapics; i++)
-			base += nr_ioapic_registers[i];
-	}
-
-	if (!ioapic && (gsi < 16))
-		gsi += base;
-
-	return gsi;
-}
-
 static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
 	unsigned long vect = 0, psaival = 0;
@@ -190,7 +172,6 @@ static void setup_unisys(void)
 		es7000_plat = ES7000_ZORRO;
 	else
 		es7000_plat = ES7000_CLASSIC;
-	ioapic_renumber_irq = es7000_rename_gsi;
 }
 
 /*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 594827c..d174d88 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1016,7 +1016,6 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
-int (*ioapic_renumber_irq)(int ioapic, int irq);
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq;
@@ -1032,11 +1031,6 @@ static int pin_2_irq(int idx, int apic, int pin)
 		irq = mp_irqs[idx].srcbusirq;
 	} else {
 		u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
-		/*
-                 * For MPS mode, so far only needed by ES7000 platform
-                 */
-		if (ioapic_renumber_irq)
-			gsi = ioapic_renumber_irq(apic, gsi);
 
 		if (gsi >= NR_IRQS_LEGACY)
 			irq = gsi;
-- 
cgit v1.1


From a2a47c6c3d1a7c01da4464b3b7be93b924f874c1 Mon Sep 17 00:00:00 2001
From: Ky Srinivasan <ksrinivasan@novell.com>
Date: Thu, 6 May 2010 12:08:41 -0700
Subject: x86: Detect running on a Microsoft HyperV system

This patch integrates HyperV detection within the framework currently
used by VmWare. With this patch, we can avoid having to replicate the
HyperV detection code in each of the Microsoft HyperV drivers.

Reworked and tweaked by Greg K-H to build properly.

Signed-off-by: K. Y. Srinivasan <ksrinivasan@novell.com>
LKML-Reference: <20100506190841.GA1605@kroah.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Vadim Rozenfeld <vrozenfe@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/Makefile     |  2 +-
 arch/x86/kernel/cpu/hypervisor.c | 10 ++++--
 arch/x86/kernel/cpu/mshyperv.c   | 67 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/mshyperv.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62..3a785da 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o		:= $(nostackp)
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
-obj-y			+= vmware.o hypervisor.o sched.o
+obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
 
 obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922..de3f4e0 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -23,6 +23,7 @@
 
 #include <asm/processor.h>
 #include <asm/vmware.h>
+#include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
 
 static inline void __cpuinit
@@ -30,6 +31,8 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)
 {
 	if (vmware_platform())
 		c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
+	else if (ms_hyperv_platform())
+		c->x86_hyper_vendor = X86_HYPER_VENDOR_MSFT;
 	else
 		c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
 }
@@ -37,10 +40,11 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)
 static inline void __cpuinit
 hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
 {
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
+	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
 		vmware_set_feature_bits(c);
-		return;
-	}
+	else if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_MSFT)
+		ms_hyperv_set_feature_bits(c);
+	return;
 }
 
 void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 0000000..2443b61
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,67 @@
+/*
+ * HyperV  Detection code.
+ *
+ * Copyright (C) 2010, Novell, Inc.
+ * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <asm/hyperv.h>
+#include <asm/mshyperv.h>
+
+
+int ms_hyperv_platform(void)
+{
+	u32 eax, ebx, ecx, edx;
+	char hyp_signature[13];
+
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+	if (!(ecx & HYPERV_HYPERVISOR_PRESENT_BIT))
+		return 0;
+
+	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &ebx, &ecx, &edx);
+	*(u32 *)(hyp_signature + 0) = ebx;
+	*(u32 *)(hyp_signature + 4) = ecx;
+	*(u32 *)(hyp_signature + 8) = edx;
+
+	if ((eax < HYPERV_CPUID_MIN) || (memcmp("Microsoft Hv", hyp_signature, 12)))
+		return 0;
+	return 1;
+}
+
+void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c)
+{
+	u32 eax, ebx, ecx, edx;
+
+	c->x86_hyper_features = 0;
+	/*
+	 * Extract the features, recommendations etc.
+	 * The first 9 bits will be used to track hypervisor features.
+	 * The next 6 bits will be used to track the hypervisor
+	 * recommendations.
+	 */
+	cpuid(HYPERV_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
+	c->x86_hyper_features |= (eax & 0x1ff);
+
+	cpuid(HYPERV_CPUID_ENLIGHTMENT_INFO, &eax, &ebx, &ecx, &edx);
+	c->x86_hyper_features |= ((eax & 0x3f) << 9);
+	printk(KERN_INFO "Detected HyperV with features: %x\n",
+		c->x86_hyper_features);
+}
-- 
cgit v1.1


From 2b107d93635616db0c3f893c8cc2e6d5cd8d77b2 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Fri, 7 May 2010 14:59:45 -0700
Subject: x86: Avoid check hlt for newer cpus

Check hlt instruction was targeted for some older CPUs. It is an expensive
operation in that it takes 4 ticks to break out the check.  We can avoid
such check completely for newer x86 cpus (family >= 5).

[ hpa: corrected family > 5 to family >= 5 ]

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1273269585-14346-1-git-send-email-jacob.jun.pan@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/bugs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01a2652..c39576c 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
 
 static void __init check_hlt(void)
 {
-	if (paravirt_enabled())
+	if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
 		return;
 
 	printk(KERN_INFO "Checking 'hlt' instruction... ");
-- 
cgit v1.1


From 9fa02317429449e8176c9bb6da3ac00eb14d52d3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Fri, 7 May 2010 16:55:41 -0700
Subject: x86, HyperV: fix up the license to mshyperv.c

This should have been GPLv2 only, we cut and pasted from the wrong file
originally, sorry.

Also removed some unneeded boilerplate license code, we all know where
to find the GPLv2, and that there's no warranty as that is implicit from
the license.

Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
LKML-Reference: <20100507235541.GA15448@kroah.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mshyperv.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 2443b61..a58d8e6 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -6,18 +6,7 @@
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ * the Free Software Foundation; version 2 of the License.
  *
  */
 
-- 
cgit v1.1


From e08cae4181af9483b04ecfac48f01c8e5a5f27bf Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 7 May 2010 16:57:28 -0700
Subject: x86: Clean up the hypervisor layer

Clean up the hypervisor layer and the hypervisor drivers, using an ops
structure instead of an enumeration with if statements.

The identity of the hypervisor, if needed, can be tested by testing
the pointer value in x86_hyper.

The MS-HyperV private state is moved into a normal global variable
(it's per-system state, not per-CPU state).  Being a normal bss
variable, it will be left at all zero on non-HyperV platforms, and so
can generally be tested for HyperV-specific features without
additional qualification.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/kernel/cpu/hypervisor.c | 56 ++++++++++++++++++++++++----------------
 arch/x86/kernel/cpu/mshyperv.c   | 51 +++++++++++++++++-------------------
 arch/x86/kernel/cpu/vmware.c     | 36 ++++++++++++++------------
 3 files changed, 78 insertions(+), 65 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index de3f4e0..8738175 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -22,40 +22,52 @@
  */
 
 #include <asm/processor.h>
-#include <asm/vmware.h>
-#include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
 
-static inline void __cpuinit
-detect_hypervisor_vendor(struct cpuinfo_x86 *c)
+/*
+ * Hypervisor detect order.  This is specified explicitly here because
+ * some hypervisors might implement compatibility modes for other
+ * hypervisors and therefore need to be detected in specific sequence.
+ */
+static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
-	if (vmware_platform())
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
-	else if (ms_hyperv_platform())
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_MSFT;
-	else
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
-}
+	&x86_hyper_vmware,
+	&x86_hyper_ms_hyperv,
+};
 
-static inline void __cpuinit
-hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
+const struct hypervisor_x86 *x86_hyper;
+
+static inline void __init
+detect_hypervisor_vendor(void)
 {
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
-		vmware_set_feature_bits(c);
-	else if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_MSFT)
-		ms_hyperv_set_feature_bits(c);
-	return;
+	const struct hypervisor_x86 *h, * const *p;
+
+	for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
+		h = *p;
+		if (h->detect()) {
+			x86_hyper = h;
+			printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
+			break;
+		}
+	}
 }
 
 void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
 {
-	detect_hypervisor_vendor(c);
-	hypervisor_set_feature_bits(c);
+	if (x86_hyper && x86_hyper->set_cpu_features)
+		x86_hyper->set_cpu_features(c);
 }
 
 void __init init_hypervisor_platform(void)
 {
+
+	detect_hypervisor_vendor();
+
+	if (!x86_hyper)
+		return;
+
 	init_hypervisor(&boot_cpu_data);
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
-		vmware_platform_setup();
+
+	if (x86_hyper->init_platform)
+		x86_hyper->init_platform();
 }
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index a58d8e6..5969c3e 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -12,45 +12,42 @@
 
 #include <linux/types.h>
 #include <asm/processor.h>
+#include <asm/hypervisor.h>
 #include <asm/hyperv.h>
 #include <asm/mshyperv.h>
 
+struct ms_hyperv_info ms_hyperv;
 
-int ms_hyperv_platform(void)
+static bool __init ms_hyperv_platform(void)
 {
-	u32 eax, ebx, ecx, edx;
-	char hyp_signature[13];
+	u32 eax;
+	u32 hyp_signature[3];
 
-	cpuid(1, &eax, &ebx, &ecx, &edx);
-	if (!(ecx & HYPERV_HYPERVISOR_PRESENT_BIT))
-		return 0;
+	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		return false;
 
-	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &ebx, &ecx, &edx);
-	*(u32 *)(hyp_signature + 0) = ebx;
-	*(u32 *)(hyp_signature + 4) = ecx;
-	*(u32 *)(hyp_signature + 8) = edx;
+	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
+	      &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
 
-	if ((eax < HYPERV_CPUID_MIN) || (memcmp("Microsoft Hv", hyp_signature, 12)))
-		return 0;
-	return 1;
+	return eax >= HYPERV_CPUID_MIN &&
+		eax <= HYPERV_CPUID_MAX &&
+		!memcmp("Microsoft Hv", hyp_signature, 12);
 }
 
-void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c)
+static void __init ms_hyperv_init_platform(void)
 {
-	u32 eax, ebx, ecx, edx;
-
-	c->x86_hyper_features = 0;
 	/*
-	 * Extract the features, recommendations etc.
-	 * The first 9 bits will be used to track hypervisor features.
-	 * The next 6 bits will be used to track the hypervisor
-	 * recommendations.
+	 * Extract the features and hints
 	 */
-	cpuid(HYPERV_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
-	c->x86_hyper_features |= (eax & 0x1ff);
+	ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+	ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
 
-	cpuid(HYPERV_CPUID_ENLIGHTMENT_INFO, &eax, &ebx, &ecx, &edx);
-	c->x86_hyper_features |= ((eax & 0x3f) << 9);
-	printk(KERN_INFO "Detected HyperV with features: %x\n",
-		c->x86_hyper_features);
+	printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
+	       ms_hyperv.features, ms_hyperv.hints);
 }
+
+const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+	.name			= "Microsoft HyperV",
+	.detect			= ms_hyperv_platform,
+	.init_platform		= ms_hyperv_init_platform,
+};
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97..46a5b5d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -23,8 +23,8 @@
 
 #include <linux/dmi.h>
 #include <asm/div64.h>
-#include <asm/vmware.h>
 #include <asm/x86_init.h>
+#include <asm/hypervisor.h>
 
 #define CPUID_VMWARE_INFO_LEAF	0x40000000
 #define VMWARE_HYPERVISOR_MAGIC	0x564D5868
@@ -64,7 +64,7 @@ static unsigned long vmware_get_tsc_khz(void)
 	return tsc_hz;
 }
 
-void __init vmware_platform_setup(void)
+static void __init vmware_platform_setup(void)
 {
 	uint32_t eax, ebx, ecx, edx;
 
@@ -82,24 +82,21 @@ void __init vmware_platform_setup(void)
  * serial key should be enough, as this will always have a VMware
  * specific string when running under VMware hypervisor.
  */
-int vmware_platform(void)
+static bool __init vmware_platform(void)
 {
 	if (cpu_has_hypervisor) {
-		unsigned int eax, ebx, ecx, edx;
-		char hyper_vendor_id[13];
-
-		cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
-		memcpy(hyper_vendor_id + 0, &ebx, 4);
-		memcpy(hyper_vendor_id + 4, &ecx, 4);
-		memcpy(hyper_vendor_id + 8, &edx, 4);
-		hyper_vendor_id[12] = '\0';
-		if (!strcmp(hyper_vendor_id, "VMwareVMware"))
-			return 1;
+		unsigned int eax;
+		unsigned int hyper_vendor_id[3];
+
+		cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
+		      &hyper_vendor_id[1], &hyper_vendor_id[2]);
+		if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
+			return true;
 	} else if (dmi_available && dmi_name_in_serial("VMware") &&
 		   __vmware_platform())
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 
 /*
@@ -114,8 +111,15 @@ int vmware_platform(void)
  * so that the kernel could just trust the hypervisor with providing a
  * reliable virtual TSC that is suitable for timekeeping.
  */
-void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
+static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
 {
 	set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 	set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
 }
+
+const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+	.name			= "VMware",
+	.detect			= vmware_platform,
+	.set_cpu_features	= vmware_set_cpu_features,
+	.init_platform		= vmware_platform_setup,
+};
-- 
cgit v1.1


From 96f6e775b58687d85ee33004d414419b5ec34106 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 9 May 2010 01:10:34 -0700
Subject: x86, hypervisor: Export the x86_hyper* symbols

Export x86_hyper and the related specific structures, allowing for
hypervisor identification by modules.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Dmitry Torokhov <dtor@vmware.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/kernel/cpu/hypervisor.c | 1 +
 arch/x86/kernel/cpu/mshyperv.c   | 1 +
 arch/x86/kernel/cpu/vmware.c     | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 8738175..4afb5a2 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -36,6 +36,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
 };
 
 const struct hypervisor_x86 *x86_hyper;
+EXPORT_SYMBOL(x86_hyper);
 
 static inline void __init
 detect_hypervisor_vendor(void)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 5969c3e..0f13717 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -51,3 +51,4 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
 	.detect			= ms_hyperv_platform,
 	.init_platform		= ms_hyperv_init_platform,
 };
+EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 265b432..b9d1ff5 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -99,7 +99,6 @@ static bool __init vmware_platform(void)
 
 	return false;
 }
-EXPORT_SYMBOL(vmware_platform);
 
 /*
  * VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -125,3 +124,4 @@ const __refconst struct hypervisor_x86 x86_hyper_vmware = {
 	.set_cpu_features	= vmware_set_cpu_features,
 	.init_platform		= vmware_platform_setup,
 };
+EXPORT_SYMBOL(x86_hyper_vmware);
-- 
cgit v1.1


From 3998d095354d2a3062bdaa821ef07a1e1c82873c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 9 May 2010 22:46:54 -0700
Subject: x86, hypervisor: add missing <linux/module.h>

EXPORT_SYMBOL() needs <linux/module.h> to be included; fixes modular
builds of the VMware balloon driver, and any future modular drivers
which depends on the hypervisor.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Dmitry Torokhov <dtor@vmware.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/kernel/cpu/hypervisor.c | 1 +
 arch/x86/kernel/cpu/mshyperv.c   | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 4afb5a2..dd531cc 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,6 +21,7 @@
  *
  */
 
+#include <linux/module.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
 
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0f13717..16f41bb 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/module.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
 #include <asm/hyperv.h>
-- 
cgit v1.1


From c9ad488289144ae5ef53b012e15895ef1f5e4bb6 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 6 May 2010 11:45:45 +0300
Subject: x86: Eliminate TS_XSAVE

The fpu code currently uses current->thread_info->status & TS_XSAVE as
a way to distinguish between XSAVE capable processors and older processors.
The decision is not really task specific; instead we use the task status to
avoid a global memory reference - the value should be the same across all
threads.

Eliminate this tie-in into the task structure by using an alternative
instruction keyed off the XSAVE cpu feature; this results in shorter and
faster code, without introducing a global memory reference.

[ hpa: in the future, this probably should use an asm jmp ]

Signed-off-by: Avi Kivity <avi@redhat.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-2-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/common.c | 5 +----
 arch/x86/kernel/i387.c       | 5 +----
 arch/x86/kernel/xsave.c      | 6 +++---
 3 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a..c1c00d0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1243,10 +1243,7 @@ void __cpuinit cpu_init(void)
 	/*
 	 * Force FPU initialization:
 	 */
-	if (cpu_has_xsave)
-		current_thread_info()->status = TS_XSAVE;
-	else
-		current_thread_info()->status = 0;
+	current_thread_info()->status = 0;
 	clear_used_math();
 	mxcsr_feature_mask_init();
 
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 54c31c2..14ca1dc 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -102,10 +102,7 @@ void __cpuinit fpu_init(void)
 
 	mxcsr_feature_mask_init();
 	/* clean state in init */
-	if (cpu_has_xsave)
-		current_thread_info()->status = TS_XSAVE;
-	else
-		current_thread_info()->status = 0;
+	current_thread_info()->status = 0;
 	clear_used_math();
 }
 #endif	/* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 782c3a36..c1b0a11 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -99,7 +99,7 @@ int save_i387_xstate(void __user *buf)
 		if (err)
 			return err;
 
-		if (task_thread_info(tsk)->status & TS_XSAVE)
+		if (use_xsave())
 			err = xsave_user(buf);
 		else
 			err = fxsave_user(buf);
@@ -116,7 +116,7 @@ int save_i387_xstate(void __user *buf)
 
 	clear_used_math(); /* trigger finit */
 
-	if (task_thread_info(tsk)->status & TS_XSAVE) {
+	if (use_xsave()) {
 		struct _fpstate __user *fx = buf;
 		struct _xstate __user *x = buf;
 		u64 xstate_bv;
@@ -225,7 +225,7 @@ int restore_i387_xstate(void __user *buf)
 		clts();
 		task_thread_info(current)->status |= TS_USEDFPU;
 	}
-	if (task_thread_info(tsk)->status & TS_XSAVE)
+	if (use_xsave())
 		err = restore_user_xstate(buf);
 	else
 		err = fxrstor_checking((__force struct i387_fxsave_struct *)
-- 
cgit v1.1


From 86603283326c9e95e5ad4e9fdddeec93cac5d9ad Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 6 May 2010 11:45:46 +0300
Subject: x86: Introduce 'struct fpu' and related API

Currently all fpu state access is through tsk->thread.xstate.  Since we wish
to generalize fpu access to non-task contexts, wrap the state in a new
'struct fpu' and convert existing access to use an fpu API.

Signal frame handlers are not converted to the API since they will remain
task context only things.

Signed-off-by: Avi Kivity <avi@redhat.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-3-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/i387.c       | 102 +++++++++++++++++++++----------------------
 arch/x86/kernel/process.c    |  21 ++++-----
 arch/x86/kernel/process_32.c |   2 +-
 arch/x86/kernel/process_64.c |   2 +-
 arch/x86/kernel/xsave.c      |   2 +-
 5 files changed, 63 insertions(+), 66 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 14ca1dc..86cef6b 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -107,57 +107,57 @@ void __cpuinit fpu_init(void)
 }
 #endif	/* CONFIG_X86_64 */
 
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
- */
-int init_fpu(struct task_struct *tsk)
+static void fpu_finit(struct fpu *fpu)
 {
-	if (tsk_used_math(tsk)) {
-		if (HAVE_HWFP && tsk == current)
-			unlazy_fpu(tsk);
-		return 0;
-	}
-
-	/*
-	 * Memory allocation at the first usage of the FPU and other state.
-	 */
-	if (!tsk->thread.xstate) {
-		tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
-						      GFP_KERNEL);
-		if (!tsk->thread.xstate)
-			return -ENOMEM;
-	}
-
 #ifdef CONFIG_X86_32
 	if (!HAVE_HWFP) {
-		memset(tsk->thread.xstate, 0, xstate_size);
-		finit_task(tsk);
-		set_stopped_child_used_math(tsk);
-		return 0;
+		finit_soft_fpu(&fpu->state->soft);
+		return;
 	}
 #endif
 
 	if (cpu_has_fxsr) {
-		struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+		struct i387_fxsave_struct *fx = &fpu->state->fxsave;
 
 		memset(fx, 0, xstate_size);
 		fx->cwd = 0x37f;
 		if (cpu_has_xmm)
 			fx->mxcsr = MXCSR_DEFAULT;
 	} else {
-		struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+		struct i387_fsave_struct *fp = &fpu->state->fsave;
 		memset(fp, 0, xstate_size);
 		fp->cwd = 0xffff037fu;
 		fp->swd = 0xffff0000u;
 		fp->twd = 0xffffffffu;
 		fp->fos = 0xffff0000u;
 	}
+}
+
+/*
+ * The _current_ task is using the FPU for the first time
+ * so initialize it and set the mxcsr to its default
+ * value at reset if we support XMM instructions and then
+ * remeber the current task has used the FPU.
+ */
+int init_fpu(struct task_struct *tsk)
+{
+	int ret;
+
+	if (tsk_used_math(tsk)) {
+		if (HAVE_HWFP && tsk == current)
+			unlazy_fpu(tsk);
+		return 0;
+	}
+
 	/*
-	 * Only the device not available exception or ptrace can call init_fpu.
+	 * Memory allocation at the first usage of the FPU and other state.
 	 */
+	ret = fpu_alloc(&tsk->thread.fpu);
+	if (ret)
+		return ret;
+
+	fpu_finit(&tsk->thread.fpu);
+
 	set_stopped_child_used_math(tsk);
 	return 0;
 }
@@ -191,7 +191,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				   &target->thread.xstate->fxsave, 0, -1);
+				   &target->thread.fpu.state->fxsave, 0, -1);
 }
 
 int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -208,19 +208,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 &target->thread.xstate->fxsave, 0, -1);
+				 &target->thread.fpu.state->fxsave, 0, -1);
 
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
 	 */
-	target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 
 	/*
 	 * update the header bits in the xsave header, indicating the
 	 * presence of FP and SSE state.
 	 */
 	if (cpu_has_xsave)
-		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
 
 	return ret;
 }
@@ -243,14 +243,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 	 * memory layout in the thread struct, so that we can copy the entire
 	 * xstateregs to the user using one user_regset_copyout().
 	 */
-	memcpy(&target->thread.xstate->fxsave.sw_reserved,
+	memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
 	       xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
 
 	/*
 	 * Copy the xstate memory layout.
 	 */
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				  &target->thread.xstate->xsave, 0, -1);
+				  &target->thread.fpu.state->xsave, 0, -1);
 	return ret;
 }
 
@@ -269,14 +269,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 &target->thread.xstate->xsave, 0, -1);
+				 &target->thread.fpu.state->xsave, 0, -1);
 
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
 	 */
-	target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 
-	xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
+	xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
 
 	xsave_hdr->xstate_bv &= pcntxt_mask;
 	/*
@@ -362,7 +362,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
 static void
 convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
 {
-	struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
 	struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
 	struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
 	int i;
@@ -402,7 +402,7 @@ static void convert_to_fxsr(struct task_struct *tsk,
 			    const struct user_i387_ia32_struct *env)
 
 {
-	struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
 	struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
 	struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
 	int i;
@@ -442,7 +442,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
 
 	if (!cpu_has_fxsr) {
 		return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-					   &target->thread.xstate->fsave, 0,
+					   &target->thread.fpu.state->fsave, 0,
 					   -1);
 	}
 
@@ -472,7 +472,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 
 	if (!cpu_has_fxsr) {
 		return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-					  &target->thread.xstate->fsave, 0, -1);
+					  &target->thread.fpu.state->fsave, 0, -1);
 	}
 
 	if (pos > 0 || count < sizeof(env))
@@ -487,7 +487,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 	 * presence of FP.
 	 */
 	if (cpu_has_xsave)
-		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
+		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
 	return ret;
 }
 
@@ -498,7 +498,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
-	struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+	struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
 
 	fp->status = fp->swd;
 	if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
@@ -509,7 +509,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
 static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
-	struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
 	struct user_i387_ia32_struct env;
 	int err = 0;
 
@@ -544,7 +544,7 @@ static int save_i387_xsave(void __user *buf)
 	 * header as well as change any contents in the memory layout.
 	 * xrestore as part of sigreturn will capture all the changes.
 	 */
-	tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+	tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
 
 	if (save_i387_fxsave(fx) < 0)
 		return -1;
@@ -596,7 +596,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
 
-	return __copy_from_user(&tsk->thread.xstate->fsave, buf,
+	return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
 				sizeof(struct i387_fsave_struct));
 }
 
@@ -607,10 +607,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
 	struct user_i387_ia32_struct env;
 	int err;
 
-	err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
+	err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
 			       size);
 	/* mxcsr reserved bits must be masked to zero for security reasons */
-	tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 	if (err || __copy_from_user(&env, buf, sizeof(env)))
 		return 1;
 	convert_to_fxsr(tsk, &env);
@@ -626,7 +626,7 @@ static int restore_i387_xsave(void __user *buf)
 	struct i387_fxsave_struct __user *fx =
 		(struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
 	struct xsave_hdr_struct *xsave_hdr =
-				&current->thread.xstate->xsave.xsave_hdr;
+				&current->thread.fpu.state->xsave.xsave_hdr;
 	u64 mask;
 	int err;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4..f18fd9c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -32,25 +32,22 @@ struct kmem_cache *task_xstate_cachep;
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
+	int ret;
+
 	*dst = *src;
-	if (src->thread.xstate) {
-		dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
-						      GFP_KERNEL);
-		if (!dst->thread.xstate)
-			return -ENOMEM;
-		WARN_ON((unsigned long)dst->thread.xstate & 15);
-		memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
+	if (fpu_allocated(&src->thread.fpu)) {
+		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
+		ret = fpu_alloc(&dst->thread.fpu);
+		if (ret)
+			return ret;
+		fpu_copy(&dst->thread.fpu, &src->thread.fpu);
 	}
 	return 0;
 }
 
 void free_thread_xstate(struct task_struct *tsk)
 {
-	if (tsk->thread.xstate) {
-		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
-		tsk->thread.xstate = NULL;
-	}
-
+	fpu_free(&tsk->thread.fpu);
 	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c6266..0a7a4f5 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -317,7 +317,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* we're going to use this soon, after a few expensive things */
 	if (preload_fpu)
-		prefetch(next->xstate);
+		prefetch(next->fpu.state);
 
 	/*
 	 * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 17cb329..979215f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -396,7 +396,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* we're going to use this soon, after a few expensive things */
 	if (preload_fpu)
-		prefetch(next->xstate);
+		prefetch(next->fpu.state);
 
 	/*
 	 * Reload esp0, LDT and the page table pointer:
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c1b0a11..37e68fc 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -109,7 +109,7 @@ int save_i387_xstate(void __user *buf)
 		task_thread_info(tsk)->status &= ~TS_USEDFPU;
 		stts();
 	} else {
-		if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
+		if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
 				   xstate_size))
 			return -1;
 	}
-- 
cgit v1.1


From 829e92458532b1dbfeb972435d45bb060cdbf5a3 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 27 Apr 2010 18:33:49 -0400
Subject: kprobes/x86: Fix removed int3 checking order

Fix kprobe/x86 to check removed int3 when failing to get kprobe
from hlist. Since we have a time window between checking int3
exists on probed address and getting kprobe on that address,
we can have following scenario:

 -------
 CPU1                     CPU2
 hit int3
 check int3 exists
                          remove int3
                          remove kprobe from hlist
 get kprobe from hlist
 no kprobe->OOPS!
 -------

This patch moves int3 checking if there is no kprobe on that
address for fixing this problem as follows:

 ------
 CPU1                     CPU2
 hit int3
                          remove int3
                          remove kprobe from hlist
 get kprobe from hlist
 no kprobe->check int3 exists
          ->rollback&retry
 ------

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Dave Anderson <anderson@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100427223348.2322.9112.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/kprobes.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b43bbae..1658efd 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -534,20 +534,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 	struct kprobe_ctlblk *kcb;
 
 	addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
-	if (*addr != BREAKPOINT_INSTRUCTION) {
-		/*
-		 * The breakpoint instruction was removed right
-		 * after we hit it.  Another cpu has removed
-		 * either a probepoint or a debugger breakpoint
-		 * at this address.  In either case, no further
-		 * handling of this interrupt is appropriate.
-		 * Back up over the (now missing) int3 and run
-		 * the original instruction.
-		 */
-		regs->ip = (unsigned long)addr;
-		return 1;
-	}
-
 	/*
 	 * We don't want to be preempted for the entire
 	 * duration of kprobe processing. We conditionally
@@ -579,6 +565,19 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 				setup_singlestep(p, regs, kcb, 0);
 			return 1;
 		}
+	} else if (*addr != BREAKPOINT_INSTRUCTION) {
+		/*
+		 * The breakpoint instruction was removed right
+		 * after we hit it.  Another cpu has removed
+		 * either a probepoint or a debugger breakpoint
+		 * at this address.  In either case, no further
+		 * handling of this interrupt is appropriate.
+		 * Back up over the (now missing) int3 and run
+		 * the original instruction.
+		 */
+		regs->ip = (unsigned long)addr;
+		preempt_enable_no_resched();
+		return 1;
 	} else if (kprobe_running()) {
 		p = __get_cpu_var(current_kprobe);
 		if (p->break_handler && p->break_handler(p, regs)) {
-- 
cgit v1.1


From a52357259680fe5368c2fabf5949209e231f2aa2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 11 May 2010 17:12:33 +0200
Subject: x86/amd-iommu: Add amd_iommu=off command line option

This patch adds a command line option to tell the AMD IOMMU
driver to not initialize any IOMMU it finds.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6360abf..3bacb4d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -120,6 +120,7 @@ struct ivmd_header {
 bool amd_iommu_dump;
 
 static int __initdata amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;
 
 u16 amd_iommu_last_bdf;			/* largest PCI device id we have
 					   to handle */
@@ -1372,6 +1373,9 @@ void __init amd_iommu_detect(void)
 	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
 		return;
 
+	if (amd_iommu_disabled)
+		return;
+
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
 		iommu_detected = 1;
 		amd_iommu_detected = 1;
@@ -1401,6 +1405,8 @@ static int __init parse_amd_iommu_options(char *str)
 	for (; *str; ++str) {
 		if (strncmp(str, "fullflush", 9) == 0)
 			amd_iommu_unmap_flush = true;
+		if (strncmp(str, "off", 3) == 0)
+			amd_iommu_disabled = true;
 	}
 
 	return 1;
-- 
cgit v1.1


From f01487119dda3d9f58c9729c7361ecc50a61c188 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <herrmann.der.user@googlemail.com>
Date: Tue, 27 Apr 2010 12:13:48 +0200
Subject: x86, amd: Check X86_FEATURE_OSVW bit before accessing OSVW MSRs

If host CPU is exposed to a guest the OSVW MSRs are not guaranteed
to be present and a GP fault occurs. Thus checking the feature flag is
essential.

Cc: <stable@kernel.org> # .32.x .33.x
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
LKML-Reference: <20100427101348.GC4489@alberich.amd.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/process.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4..0415c3e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -546,11 +546,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
 		 * check OSVW bit for CPUs that are not affected
 		 * by erratum #400
 		 */
-		rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
-		if (val >= 2) {
-			rdmsrl(MSR_AMD64_OSVW_STATUS, val);
-			if (!(val & BIT(1)))
-				goto no_c1e_idle;
+		if (cpu_has(c, X86_FEATURE_OSVW)) {
+			rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
+			if (val >= 2) {
+				rdmsrl(MSR_AMD64_OSVW_STATUS, val);
+				if (!(val & BIT(1)))
+					goto no_c1e_idle;
+			}
 		}
 		return 1;
 	}
-- 
cgit v1.1


From 7f284d3cc96e02468a42e045f77af11e5ff8b095 Mon Sep 17 00:00:00 2001
From: Frank Arnold <frank.arnold@amd.com>
Date: Thu, 22 Apr 2010 16:06:59 +0200
Subject: x86, cacheinfo: Turn off L3 cache index disable feature in
 virtualized environments

When running a quest kernel on xen we get:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
IP: [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
PGD 0
Oops: 0000 [#1] SMP
last sysfs file:
CPU 0
Modules linked in:

Pid: 0, comm: swapper Tainted: G        W  2.6.34-rc3 #1 /HVM domU
RIP: 0010:[<ffffffff8142f2fb>]  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x
2ca/0x3df
RSP: 0018:ffff880002203e08  EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000060
RDX: 0000000000000000 RSI: 0000000000000040 RDI: 0000000000000000
RBP: ffff880002203ed8 R08: 00000000000017c0 R09: ffff880002203e38
R10: ffff8800023d5d40 R11: ffffffff81a01e28 R12: ffff880187e6f5c0
R13: ffff880002203e34 R14: ffff880002203e58 R15: ffff880002203e68
FS:  0000000000000000(0000) GS:ffff880002200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000038 CR3: 0000000001a3c000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a44020)
Stack:
 ffffffff810d7ecb ffff880002203e20 ffffffff81059140 ffff880002203e30
<0> ffffffff810d7ec9 0000000002203e40 000000000050d140 ffff880002203e70
<0> 0000000002008140 0000000000000086 ffff880040020140 ffffffff81068b8b
Call Trace:
 <IRQ>
 [<ffffffff810d7ecb>] ? sync_supers_timer_fn+0x0/0x1c
 [<ffffffff81059140>] ? mod_timer+0x23/0x25
 [<ffffffff810d7ec9>] ? arm_supers_timer+0x34/0x36
 [<ffffffff81068b8b>] ? hrtimer_get_next_event+0xa7/0xc3
 [<ffffffff81058e85>] ? get_next_timer_interrupt+0x19a/0x20d
 [<ffffffff8142fa23>] get_cpu_leaves+0x5c/0x232
 [<ffffffff8106a7b1>] ? sched_clock_local+0x1c/0x82
 [<ffffffff8106a9a0>] ? sched_clock_tick+0x75/0x7a
 [<ffffffff8107748c>] generic_smp_call_function_single_interrupt+0xae/0xd0
 [<ffffffff8101f6ef>] smp_call_function_single_interrupt+0x18/0x27
 [<ffffffff8100a773>] call_function_single_interrupt+0x13/0x20
 <EOI>
 [<ffffffff8143c468>] ? notifier_call_chain+0x14/0x63
 [<ffffffff810295c6>] ? native_safe_halt+0xc/0xd
 [<ffffffff810114eb>] ? default_idle+0x36/0x53
 [<ffffffff81008c22>] cpu_idle+0xaa/0xe4
 [<ffffffff81423a9a>] rest_init+0x7e/0x80
 [<ffffffff81b10dd2>] start_kernel+0x40e/0x419
 [<ffffffff81b102c8>] x86_64_start_reservations+0xb3/0xb7
 [<ffffffff81b103c4>] x86_64_start_kernel+0xf8/0x107
Code: 14 d5 40 ff ae 81 8b 14 02 31 c0 3b 15 47 1c 8b 00 7d 0e 48 8b 05 36 1c 8b
 00 48 63 d2 48 8b 04 d0 c7 85 5c ff ff ff 00 00 00 00 <8b> 70 38 48 8d 8d 5c ff
 ff ff 48 8b 78 10 ba c4 01 00 00 e8 eb
RIP  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
 RSP <ffff880002203e08>
CR2: 0000000000000038
---[ end trace a7919e7f17c0a726 ]---

The L3 cache index disable feature of AMD CPUs has to be disabled if the
kernel is running as guest on top of a hypervisor because northbridge
devices are not available to the guest. Currently, this fixes a boot
crash on top of Xen. In the future this will become an issue on KVM as
well.

Check if northbridge devices are present and do not enable the feature
if there are none.

[ hpa: backported to 2.6.34 ]

Signed-off-by: Frank Arnold <frank.arnold@amd.com>
LKML-Reference: <1271945222-5283-3-git-send-email-bp@amd64.org>
Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@kernel.org>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66..95962a9 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -340,6 +340,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	     (boot_cpu_data.x86_mask  < 0x1)))
 		return;
 
+	/* not in virtualized environments */
+	if (num_k8_northbridges == 0)
+		return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
-- 
cgit v1.1


From fea24e28c663e62663097f0ed3b8ff1f9a87f15e Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Fri, 14 May 2010 14:41:20 -0700
Subject: x86, mrst: add nop functions to x86_init mpparse functions

Moorestown does not have BIOS provided MP tables, we can save some time
by avoiding scaning of these tables. e.g.
[    0.000000] Scan SMP from c0000000 for 1024 bytes.
[    0.000000] Scan SMP from c009fc00 for 1024 bytes.
[    0.000000] Scan SMP from c00f0000 for 65536 bytes.
[    0.000000] Scan SMP from c00bfff0 for 1024 bytes.

Searching EBDA with the base at 0x40E will also result in random pointer
deferencing within 1MB. This can be a problem in Lincroft if the pointer
hits VGA area and VGA mode is not enabled.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1273873281-17489-8-git-send-email-jacob.jun.pan@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/mrst.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 0aad867..e796448 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -237,4 +237,9 @@ void __init x86_mrst_early_setup(void)
 	x86_init.pci.fixup_irqs = x86_init_noop;
 
 	legacy_pic = &null_legacy_pic;
+
+	/* Avoid searching for BIOS MP tables */
+	x86_init.mpparse.find_smp_config = x86_init_noop;
+	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+
 }
-- 
cgit v1.1