From d8c97d5f3aa348272df2ccb4e224b1cf9a1eb6d7 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 8 Sep 2005 12:39:59 -0700 Subject: [IA64] simplified efi memory map parsing New version leaves the original memory map unmodified. Also saves any granule trimmings for use by the uncached memory allocator. Inspired by Khalid Aziz (various traces of his patch still remain). Fixes to uncached_build_memmap() and sn2 testing by Martin Hicks. Signed-off-by: Tony Luck --- arch/ia64/kernel/efi.c | 423 +++++++++++++++++++++++++------------------- arch/ia64/kernel/setup.c | 3 + arch/ia64/kernel/uncached.c | 17 +- 3 files changed, 251 insertions(+), 192 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 179f230..1291db5 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -239,57 +239,30 @@ is_available_memory (efi_memory_desc_t *md) return 0; } -/* - * Trim descriptor MD so its starts at address START_ADDR. If the descriptor covers - * memory that is normally available to the kernel, issue a warning that some memory - * is being ignored. - */ -static void -trim_bottom (efi_memory_desc_t *md, u64 start_addr) -{ - u64 num_skipped_pages; - - if (md->phys_addr >= start_addr || !md->num_pages) - return; - - num_skipped_pages = (start_addr - md->phys_addr) >> EFI_PAGE_SHIFT; - if (num_skipped_pages > md->num_pages) - num_skipped_pages = md->num_pages; +typedef struct kern_memdesc { + u64 attribute; + u64 start; + u64 num_pages; +} kern_memdesc_t; - if (is_available_memory(md)) - printk(KERN_NOTICE "efi.%s: ignoring %luKB of memory at 0x%lx due to granule hole " - "at 0x%lx\n", __FUNCTION__, - (num_skipped_pages << EFI_PAGE_SHIFT) >> 10, - md->phys_addr, start_addr - IA64_GRANULE_SIZE); - /* - * NOTE: Don't set md->phys_addr to START_ADDR because that could cause the memory - * descriptor list to become unsorted. In such a case, md->num_pages will be - * zero, so the Right Thing will happen. - */ - md->phys_addr += num_skipped_pages << EFI_PAGE_SHIFT; - md->num_pages -= num_skipped_pages; -} +static kern_memdesc_t *kern_memmap; static void -trim_top (efi_memory_desc_t *md, u64 end_addr) +walk (efi_freemem_callback_t callback, void *arg, u64 attr) { - u64 num_dropped_pages, md_end_addr; + kern_memdesc_t *k; + u64 start, end, voff; - md_end_addr = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - - if (md_end_addr <= end_addr || !md->num_pages) - return; - - num_dropped_pages = (md_end_addr - end_addr) >> EFI_PAGE_SHIFT; - if (num_dropped_pages > md->num_pages) - num_dropped_pages = md->num_pages; - - if (is_available_memory(md)) - printk(KERN_NOTICE "efi.%s: ignoring %luKB of memory at 0x%lx due to granule hole " - "at 0x%lx\n", __FUNCTION__, - (num_dropped_pages << EFI_PAGE_SHIFT) >> 10, - md->phys_addr, end_addr); - md->num_pages -= num_dropped_pages; + voff = (attr == EFI_MEMORY_WB) ? 
PAGE_OFFSET : __IA64_UNCACHED_OFFSET; + for (k = kern_memmap; k->start != ~0UL; k++) { + if (k->attribute != attr) + continue; + start = PAGE_ALIGN(k->start); + end = (k->start + (k->num_pages << EFI_PAGE_SHIFT)) & PAGE_MASK; + if (start < end) + if ((*callback)(start + voff, end + voff, arg) < 0) + return; + } } /* @@ -299,148 +272,19 @@ trim_top (efi_memory_desc_t *md, u64 end_addr) void efi_memmap_walk (efi_freemem_callback_t callback, void *arg) { - int prev_valid = 0; - struct range { - u64 start; - u64 end; - } prev, curr; - void *efi_map_start, *efi_map_end, *p, *q; - efi_memory_desc_t *md, *check_md; - u64 efi_desc_size, start, end, granule_addr, last_granule_addr, first_non_wb_addr = 0; - unsigned long total_mem = 0; - - efi_map_start = __va(ia64_boot_param->efi_memmap); - efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; - efi_desc_size = ia64_boot_param->efi_memdesc_size; - - for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { - md = p; - - /* skip over non-WB memory descriptors; that's all we're interested in... */ - if (!(md->attribute & EFI_MEMORY_WB)) - continue; - - /* - * granule_addr is the base of md's first granule. - * [granule_addr - first_non_wb_addr) is guaranteed to - * be contiguous WB memory. - */ - granule_addr = GRANULEROUNDDOWN(md->phys_addr); - first_non_wb_addr = max(first_non_wb_addr, granule_addr); - - if (first_non_wb_addr < md->phys_addr) { - trim_bottom(md, granule_addr + IA64_GRANULE_SIZE); - granule_addr = GRANULEROUNDDOWN(md->phys_addr); - first_non_wb_addr = max(first_non_wb_addr, granule_addr); - } - - for (q = p; q < efi_map_end; q += efi_desc_size) { - check_md = q; - - if ((check_md->attribute & EFI_MEMORY_WB) && - (check_md->phys_addr == first_non_wb_addr)) - first_non_wb_addr += check_md->num_pages << EFI_PAGE_SHIFT; - else - break; /* non-WB or hole */ - } - - last_granule_addr = GRANULEROUNDDOWN(first_non_wb_addr); - if (last_granule_addr < md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) - trim_top(md, last_granule_addr); - - if (is_available_memory(md)) { - if (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) >= max_addr) { - if (md->phys_addr >= max_addr) - continue; - md->num_pages = (max_addr - md->phys_addr) >> EFI_PAGE_SHIFT; - first_non_wb_addr = max_addr; - } - - if (total_mem >= mem_limit) - continue; - - if (total_mem + (md->num_pages << EFI_PAGE_SHIFT) > mem_limit) { - unsigned long limit_addr = md->phys_addr; - - limit_addr += mem_limit - total_mem; - limit_addr = GRANULEROUNDDOWN(limit_addr); - - if (md->phys_addr > limit_addr) - continue; - - md->num_pages = (limit_addr - md->phys_addr) >> - EFI_PAGE_SHIFT; - first_non_wb_addr = max_addr = md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT); - } - total_mem += (md->num_pages << EFI_PAGE_SHIFT); - - if (md->num_pages == 0) - continue; - - curr.start = PAGE_OFFSET + md->phys_addr; - curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT); - - if (!prev_valid) { - prev = curr; - prev_valid = 1; - } else { - if (curr.start < prev.start) - printk(KERN_ERR "Oops: EFI memory table not ordered!\n"); - - if (prev.end == curr.start) { - /* merge two consecutive memory ranges */ - prev.end = curr.end; - } else { - start = PAGE_ALIGN(prev.start); - end = prev.end & PAGE_MASK; - if ((end > start) && (*callback)(start, end, arg) < 0) - return; - prev = curr; - } - } - } - } - if (prev_valid) { - start = PAGE_ALIGN(prev.start); - end = prev.end & PAGE_MASK; - if (end > start) - (*callback)(start, end, arg); - } + walk(callback, arg, EFI_MEMORY_WB); } /* - * Walk 
the EFI memory map to pull out leftover pages in the lower - * memory regions which do not end up in the regular memory map and - * stick them into the uncached allocator - * - * The regular walk function is significantly more complex than the - * uncached walk which means it really doesn't make sense to try and - * marge the two. + * Walks the EFI memory map and calls CALLBACK once for each EFI memory descriptor that + * has memory that is available for uncached allocator. */ -void __init -efi_memmap_walk_uc (efi_freemem_callback_t callback) +void +efi_memmap_walk_uc (efi_freemem_callback_t callback, void *arg) { - void *efi_map_start, *efi_map_end, *p; - efi_memory_desc_t *md; - u64 efi_desc_size, start, end; - - efi_map_start = __va(ia64_boot_param->efi_memmap); - efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; - efi_desc_size = ia64_boot_param->efi_memdesc_size; - - for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { - md = p; - if (md->attribute == EFI_MEMORY_UC) { - start = PAGE_ALIGN(md->phys_addr); - end = PAGE_ALIGN((md->phys_addr+(md->num_pages << EFI_PAGE_SHIFT)) & PAGE_MASK); - if ((*callback)(start, end, NULL) < 0) - return; - } - } + walk(callback, arg, EFI_MEMORY_UC); } - /* * Look for the PAL_CODE region reported by EFI and maps it using an * ITR to enable safe PAL calls in virtual mode. See IA-64 Processor @@ -862,3 +706,220 @@ efi_uart_console_only(void) printk(KERN_ERR "Malformed %s value\n", name); return 0; } + +#define efi_md_size(md) (md->num_pages << EFI_PAGE_SHIFT) + +static inline u64 +kmd_end(kern_memdesc_t *kmd) +{ + return (kmd->start + (kmd->num_pages << EFI_PAGE_SHIFT)); +} + +static inline u64 +efi_md_end(efi_memory_desc_t *md) +{ + return (md->phys_addr + efi_md_size(md)); +} + +static inline int +efi_wb(efi_memory_desc_t *md) +{ + return (md->attribute & EFI_MEMORY_WB); +} + +static inline int +efi_uc(efi_memory_desc_t *md) +{ + return (md->attribute & EFI_MEMORY_UC); +} + +/* + * Look for the first granule aligned memory descriptor memory + * that is big enough to hold EFI memory map. Make sure this + * descriptor is atleast granule sized so it does not get trimmed + */ +struct kern_memdesc * +find_memmap_space (void) +{ + u64 contig_low=0, contig_high=0; + u64 as = 0, ae; + void *efi_map_start, *efi_map_end, *p, *q; + efi_memory_desc_t *md, *pmd = NULL, *check_md; + u64 space_needed, efi_desc_size; + unsigned long total_mem = 0; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + /* + * Worst case: we need 3 kernel descriptors for each efi descriptor + * (if every entry has a WB part in the middle, and UC head and tail), + * plus one for the end marker. 
+ */ + space_needed = sizeof(kern_memdesc_t) * + (3 * (ia64_boot_param->efi_memmap_size/efi_desc_size) + 1); + + for (p = efi_map_start; p < efi_map_end; pmd = md, p += efi_desc_size) { + md = p; + if (!efi_wb(md)) { + continue; + } + if (pmd == NULL || !efi_wb(pmd) || efi_md_end(pmd) != md->phys_addr) { + contig_low = GRANULEROUNDUP(md->phys_addr); + contig_high = efi_md_end(md); + for (q = p + efi_desc_size; q < efi_map_end; q += efi_desc_size) { + check_md = q; + if (!efi_wb(check_md)) + break; + if (contig_high != check_md->phys_addr) + break; + contig_high = efi_md_end(check_md); + } + contig_high = GRANULEROUNDDOWN(contig_high); + } + if (!is_available_memory(md) || md->type == EFI_LOADER_DATA) + continue; + + /* Round ends inward to granule boundaries */ + as = max(contig_low, md->phys_addr); + ae = min(contig_high, efi_md_end(md)); + + /* keep within max_addr= command line arg */ + ae = min(ae, max_addr); + if (ae <= as) + continue; + + /* avoid going over mem= command line arg */ + if (total_mem + (ae - as) > mem_limit) + ae -= total_mem + (ae - as) - mem_limit; + + if (ae <= as) + continue; + + if (ae - as > space_needed) + break; + } + if (p >= efi_map_end) + panic("Can't allocate space for kernel memory descriptors"); + + return __va(as); +} + +/* + * Walk the EFI memory map and gather all memory available for kernel + * to use. We can allocate partial granules only if the unavailable + * parts exist, and are WB. + */ +void +efi_memmap_init(unsigned long *s, unsigned long *e) +{ + struct kern_memdesc *k, *prev = 0; + u64 contig_low=0, contig_high=0; + u64 as, ae, lim; + void *efi_map_start, *efi_map_end, *p, *q; + efi_memory_desc_t *md, *pmd = NULL, *check_md; + u64 efi_desc_size; + unsigned long total_mem = 0; + + k = kern_memmap = find_memmap_space(); + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; pmd = md, p += efi_desc_size) { + md = p; + if (!efi_wb(md)) { + if (efi_uc(md) && (md->type == EFI_CONVENTIONAL_MEMORY || + md->type == EFI_BOOT_SERVICES_DATA)) { + k->attribute = EFI_MEMORY_UC; + k->start = md->phys_addr; + k->num_pages = md->num_pages; + k++; + } + continue; + } + if (pmd == NULL || !efi_wb(pmd) || efi_md_end(pmd) != md->phys_addr) { + contig_low = GRANULEROUNDUP(md->phys_addr); + contig_high = efi_md_end(md); + for (q = p + efi_desc_size; q < efi_map_end; q += efi_desc_size) { + check_md = q; + if (!efi_wb(check_md)) + break; + if (contig_high != check_md->phys_addr) + break; + contig_high = efi_md_end(check_md); + } + contig_high = GRANULEROUNDDOWN(contig_high); + } + if (!is_available_memory(md)) + continue; + + /* + * Round ends inward to granule boundaries + * Give trimmings to uncached allocator + */ + if (md->phys_addr < contig_low) { + lim = min(efi_md_end(md), contig_low); + if (efi_uc(md)) { + if (k > kern_memmap && (k-1)->attribute == EFI_MEMORY_UC && + kmd_end(k-1) == md->phys_addr) { + (k-1)->num_pages += (lim - md->phys_addr) >> EFI_PAGE_SHIFT; + } else { + k->attribute = EFI_MEMORY_UC; + k->start = md->phys_addr; + k->num_pages = (lim - md->phys_addr) >> EFI_PAGE_SHIFT; + k++; + } + } + as = contig_low; + } else + as = md->phys_addr; + + if (efi_md_end(md) > contig_high) { + lim = max(md->phys_addr, contig_high); + if (efi_uc(md)) { + if (lim == md->phys_addr && k > kern_memmap && + (k-1)->attribute == EFI_MEMORY_UC && + kmd_end(k-1) == md->phys_addr) { + (k-1)->num_pages += 
md->num_pages; + } else { + k->attribute = EFI_MEMORY_UC; + k->start = lim; + k->num_pages = (efi_md_end(md) - lim) >> EFI_PAGE_SHIFT; + k++; + } + } + ae = contig_high; + } else + ae = efi_md_end(md); + + /* keep within max_addr= command line arg */ + ae = min(ae, max_addr); + if (ae <= as) + continue; + + /* avoid going over mem= command line arg */ + if (total_mem + (ae - as) > mem_limit) + ae -= total_mem + (ae - as) - mem_limit; + + if (ae <= as) + continue; + if (prev && kmd_end(prev) == md->phys_addr) { + prev->num_pages += (ae - as) >> EFI_PAGE_SHIFT; + total_mem += ae - as; + continue; + } + k->attribute = EFI_MEMORY_WB; + k->start = as; + k->num_pages = (ae - as) >> EFI_PAGE_SHIFT; + total_mem += ae - as; + prev = k++; + } + k->start = ~0L; /* end-marker */ + + /* reserve the memory we are using for kern_memmap */ + *s = (u64)kern_memmap; + *e = (u64)++k; +} diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 84f89da..1658d68 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -211,6 +211,9 @@ reserve_memory (void) } #endif + efi_memmap_init(&rsvd_region[n].start, &rsvd_region[n].end); + n++; + /* end of memory marker */ rsvd_region[n].start = ~0UL; rsvd_region[n].end = ~0UL; diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index 4e9d06c..c6d4044 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -205,23 +205,18 @@ EXPORT_SYMBOL(uncached_free_page); static int __init uncached_build_memmap(unsigned long start, unsigned long end, void *arg) { - long length; - unsigned long vstart, vend; + long length = end - start; int node; - length = end - start; - vstart = start + __IA64_UNCACHED_OFFSET; - vend = end + __IA64_UNCACHED_OFFSET; - dprintk(KERN_ERR "uncached_build_memmap(%lx %lx)\n", start, end); - memset((char *)vstart, 0, length); + memset((char *)start, 0, length); - node = paddr_to_nid(start); + node = paddr_to_nid(start - __IA64_UNCACHED_OFFSET); - for (; vstart < vend ; vstart += PAGE_SIZE) { - dprintk(KERN_INFO "sticking %lx into the pool!\n", vstart); - gen_pool_free(uncached_pool[node], vstart, PAGE_SIZE); + for (; start < end ; start += PAGE_SIZE) { + dprintk(KERN_INFO "sticking %lx into the pool!\n", start); + gen_pool_free(uncached_pool[node], start, PAGE_SIZE); } return 0; -- cgit v1.1 From be379124c0a5abfbe57dab2823fe8a71ce797aee Mon Sep 17 00:00:00 2001 From: Khalid Aziz Date: Mon, 19 Sep 2005 15:42:36 -0700 Subject: [IA64] include EFI memory information in /proc/iomem User mode kexec tools expect to find information about physical memory in /proc/iomem (as they do on x86) to validate the addresses that the new kernel will use. 
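For context only (not part of this patch): a minimal user-space sketch of how a kexec-style tool might consume the new /proc/iomem entries. The parsing approach mirrors what the commit describes; the file handling and output format here are illustrative, not taken from the kexec tools themselves.

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *fp = fopen("/proc/iomem", "r");
	char line[128];
	unsigned long long start, end;

	if (!fp) {
		perror("/proc/iomem");
		return 1;
	}
	while (fgets(line, sizeof(line), fp)) {
		if (line[0] == ' ')		/* skip nested (child) resources */
			continue;
		if (sscanf(line, "%llx-%llx :", &start, &end) != 2)
			continue;
		if (strstr(line, "System RAM"))	/* usable physical memory ranges */
			printf("RAM: 0x%llx-0x%llx\n", start, end);
	}
	fclose(fp);
	return 0;
}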
Signed-off-by: Khalid Aziz Signed-off-by: Tony Luck --- arch/ia64/kernel/efi.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/ia64/kernel/setup.c | 29 ++++++++++++++++ 2 files changed, 116 insertions(+) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 1291db5..f72ea6a 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -923,3 +923,90 @@ efi_memmap_init(unsigned long *s, unsigned long *e) *s = (u64)kern_memmap; *e = (u64)++k; } + +void +efi_initialize_iomem_resources(struct resource *code_resource, + struct resource *data_resource) +{ + struct resource *res; + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + char *name; + unsigned long flags; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + res = NULL; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + + if (md->num_pages == 0) /* should not happen */ + continue; + + flags = IORESOURCE_MEM; + switch (md->type) { + + case EFI_MEMORY_MAPPED_IO: + case EFI_MEMORY_MAPPED_IO_PORT_SPACE: + continue; + + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_CONVENTIONAL_MEMORY: + if (md->attribute & EFI_MEMORY_WP) { + name = "System ROM"; + flags |= IORESOURCE_READONLY; + } else { + name = "System RAM"; + } + break; + + case EFI_ACPI_MEMORY_NVS: + name = "ACPI Non-volatile Storage"; + flags |= IORESOURCE_BUSY; + break; + + case EFI_UNUSABLE_MEMORY: + name = "reserved"; + flags |= IORESOURCE_BUSY | IORESOURCE_DISABLED; + break; + + case EFI_RESERVED_TYPE: + case EFI_RUNTIME_SERVICES_CODE: + case EFI_RUNTIME_SERVICES_DATA: + case EFI_ACPI_RECLAIM_MEMORY: + default: + name = "reserved"; + flags |= IORESOURCE_BUSY; + break; + } + + if ((res = kcalloc(1, sizeof(struct resource), GFP_KERNEL)) == NULL) { + printk(KERN_ERR "failed to alocate resource for iomem\n"); + return; + } + + res->name = name; + res->start = md->phys_addr; + res->end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; + res->flags = flags; + + if (insert_resource(&iomem_resource, res) < 0) + kfree(res); + else { + /* + * We don't know which region contains + * kernel data so we try it repeatedly and + * let the resource manager test it. 
+ */ + insert_resource(res, code_resource); + insert_resource(res, data_resource); + } + } +} diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 1658d68..83b37c4 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -78,6 +78,19 @@ struct screen_info screen_info; unsigned long vga_console_iobase; unsigned long vga_console_membase; +static struct resource data_resource = { + .name = "Kernel data", + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource code_resource = { + .name = "Kernel code", + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; +extern void efi_initialize_iomem_resources(struct resource *, + struct resource *); +extern char _text[], _edata[], _etext[]; + unsigned long ia64_max_cacheline_size; unsigned long ia64_iobase; /* virtual address for I/O accesses */ EXPORT_SYMBOL(ia64_iobase); @@ -171,6 +184,22 @@ sort_regions (struct rsvd_region *rsvd_region, int max) } } +/* + * Request address space for all standard resources + */ +static int __init register_memory(void) +{ + code_resource.start = ia64_tpa(_text); + code_resource.end = ia64_tpa(_etext) - 1; + data_resource.start = ia64_tpa(_etext); + data_resource.end = ia64_tpa(_edata) - 1; + efi_initialize_iomem_resources(&code_resource, &data_resource); + + return 0; +} + +__initcall(register_memory); + /** * reserve_memory - setup reserved memory areas * -- cgit v1.1 From 44c451208da397438e7062393aeb3a19ddb76a60 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 16 Sep 2005 11:43:10 -0600 Subject: [IA64] ia64: add ar.k0 usage note Update comment about how ar.k0 is used. Make the initialization the same as in start_secondary() (no functional change, just make it look more similar). Signed-off-by: Bjorn Helgaas Signed-off-by: Tony Luck --- arch/ia64/kernel/setup.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 1f5c26d..e256b11 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -244,28 +244,31 @@ find_initrd (void) static void __init io_port_init (void) { - extern unsigned long ia64_iobase; unsigned long phys_iobase; /* - * Set `iobase' to the appropriate address in region 6 (uncached access range). + * Set `iobase' based on the EFI memory map or, failing that, the + * value firmware left in ar.k0. * - * The EFI memory map is the "preferred" location to get the I/O port space base, - * rather the relying on AR.KR0. This should become more clear in future SAL - * specs. We'll fall back to getting it out of AR.KR0 if no appropriate entry is - * found in the memory map. + * Note that in ia32 mode, IN/OUT instructions use ar.k0 to compute + * the port's virtual address, so ia32_load_state() loads it with a + * user virtual address. But in ia64 mode, glibc uses the + * *physical* address in ar.k0 to mmap the appropriate area from + * /dev/mem, and the inX()/outX() interfaces use MMIO. In both + * cases, user-mode can only use the legacy 0-64K I/O port space. + * + * ar.k0 is not involved in kernel I/O port accesses, which can use + * any of the I/O port spaces and are done via MMIO using the + * virtual mmio_base from the appropriate io_space[]. 
*/ phys_iobase = efi_get_iobase(); - if (phys_iobase) - /* set AR.KR0 since this is all we use it for anyway */ - ia64_set_kr(IA64_KR_IO_BASE, phys_iobase); - else { + if (!phys_iobase) { phys_iobase = ia64_get_kr(IA64_KR_IO_BASE); - printk(KERN_INFO "No I/O port range found in EFI memory map, falling back " - "to AR.KR0\n"); - printk(KERN_INFO "I/O port base = 0x%lx\n", phys_iobase); + printk(KERN_INFO "No I/O port range found in EFI memory map, " + "falling back to AR.KR0 (0x%lx)\n", phys_iobase); } ia64_iobase = (unsigned long) ioremap(phys_iobase, 0); + ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase)); /* setup legacy IO port space */ io_space[0].mmio_base = ia64_iobase; -- cgit v1.1 From 650316f1228c0dc5e45c17765caef30db62468cd Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 16 Sep 2005 11:43:45 -0600 Subject: [IA64] move ACPI IOSAPIC locality domain mapping from pci.c to acpi.c Move acpi_map_iosapics() from pci.c to acpi.c, since it doesn't have anything to do with PCI. Signed-off-by: Bjorn Helgaas Signed-off-by: Tony Luck --- arch/ia64/kernel/acpi.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 7e92647..9ad94dd 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -838,7 +838,7 @@ EXPORT_SYMBOL(acpi_unmap_lsapic); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ #ifdef CONFIG_ACPI_NUMA -acpi_status __devinit +static acpi_status __devinit acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -890,7 +890,16 @@ acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret) map_iosapic_to_node(gsi_base, node); return AE_OK; } -#endif /* CONFIG_NUMA */ + +static int __init +acpi_map_iosapics (void) +{ + acpi_get_devices(NULL, acpi_map_iosapic, NULL, NULL); + return 0; +} + +fs_initcall(acpi_map_iosapics); +#endif /* CONFIG_ACPI_NUMA */ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) { -- cgit v1.1 From 20bb86852a6b7d9ca8c48ff921ff3904038959cf Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Thu, 22 Sep 2005 18:49:15 +1000 Subject: [IA64] Wire in the MCA/INIT handler stacks Wire the MCA/INIT handler stacks into DTR[2] and track them in IA64_KR(CURRENT_STACK). This gives the MCA/INIT handler stacks the same TLB status as normal kernel stacks. Reload the old CURRENT_STACK data on return from OS to SAL. 
Signed-off-by: Keith Owens Signed-off-by: Tony Luck --- arch/ia64/kernel/mca_asm.S | 96 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 85 insertions(+), 11 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S index 499a065..db32fc1 100644 --- a/arch/ia64/kernel/mca_asm.S +++ b/arch/ia64/kernel/mca_asm.S @@ -489,24 +489,27 @@ ia64_state_save: ;; st8 [temp1]=r17,16 // pal_min_state st8 [temp2]=r6,16 // prev_IA64_KR_CURRENT + mov r6=IA64_KR(CURRENT_STACK) + ;; + st8 [temp1]=r6,16 // prev_IA64_KR_CURRENT_STACK + st8 [temp2]=r0,16 // prev_task, starts off as NULL mov r6=cr.ifa ;; - st8 [temp1]=r0,16 // prev_task, starts off as NULL - st8 [temp2]=r12,16 // cr.isr + st8 [temp1]=r12,16 // cr.isr + st8 [temp2]=r6,16 // cr.ifa mov r12=cr.itir ;; - st8 [temp1]=r6,16 // cr.ifa - st8 [temp2]=r12,16 // cr.itir + st8 [temp1]=r12,16 // cr.itir + st8 [temp2]=r11,16 // cr.iipa mov r12=cr.iim ;; - st8 [temp1]=r11,16 // cr.iipa - st8 [temp2]=r12,16 // cr.iim - mov r6=cr.iha + st8 [temp1]=r12,16 // cr.iim (p1) mov r12=IA64_MCA_COLD_BOOT (p2) mov r12=IA64_INIT_WARM_BOOT + mov r6=cr.iha ;; - st8 [temp1]=r6,16 // cr.iha - st8 [temp2]=r12 // os_status, default is cold boot + st8 [temp2]=r6,16 // cr.iha + st8 [temp1]=r12 // os_status, default is cold boot mov r6=IA64_MCA_SAME_CONTEXT ;; st8 [temp1]=r6 // context, default is same context @@ -823,9 +826,12 @@ ia64_state_restore: ld8 r12=[temp1],16 // sal_ra ld8 r9=[temp2],16 // sal_gp ;; - ld8 r22=[temp1],24 // pal_min_state, virtual. skip prev_task + ld8 r22=[temp1],16 // pal_min_state, virtual ld8 r21=[temp2],16 // prev_IA64_KR_CURRENT ;; + ld8 r16=[temp1],16 // prev_IA64_KR_CURRENT_STACK + ld8 r20=[temp2],16 // prev_task + ;; ld8 temp3=[temp1],16 // cr.isr ld8 temp4=[temp2],16 // cr.ifa ;; @@ -846,6 +852,45 @@ ia64_state_restore: ld8 r8=[temp1] // os_status ld8 r10=[temp2] // context + /* Wire IA64_TR_CURRENT_STACK to the stack that we are resuming to. To + * avoid any dependencies on the algorithm in ia64_switch_to(), just + * purge any existing CURRENT_STACK mapping and insert the new one. + * + * r16 contains prev_IA64_KR_CURRENT_STACK, r21 contains + * prev_IA64_KR_CURRENT, these values may have been changed by the C + * code. Do not use r8, r9, r10, r22, they contain values ready for + * the return to SAL. + */ + + mov r15=IA64_KR(CURRENT_STACK) // physical granule mapped by IA64_TR_CURRENT_STACK + ;; + shl r15=r15,IA64_GRANULE_SHIFT + ;; + dep r15=-1,r15,61,3 // virtual granule + mov r18=IA64_GRANULE_SHIFT<<2 // for cr.itir.ps + ;; + ptr.d r15,r18 + ;; + srlz.d + + extr.u r19=r21,61,3 // r21 = prev_IA64_KR_CURRENT + shl r20=r16,IA64_GRANULE_SHIFT // r16 = prev_IA64_KR_CURRENT_STACK + movl r21=PAGE_KERNEL // page properties + ;; + mov IA64_KR(CURRENT_STACK)=r16 + cmp.ne p6,p0=RGN_KERNEL,r19 // new stack is in the kernel region? 
+ or r21=r20,r21 // construct PA | page properties +(p6) br.spnt 1f // the dreaded cpu 0 idle task in region 5:( + ;; + mov cr.itir=r18 + mov cr.ifa=r21 + mov r20=IA64_TR_CURRENT_STACK + ;; + itr.d dtr[r20]=r21 + ;; + srlz.d +1: + br.sptk b0 //EndStub////////////////////////////////////////////////////////////////////// @@ -982,6 +1027,7 @@ ia64_set_kernel_registers: add temp4=temp4, temp1 // &struct ia64_sal_os_state.os_gp add r12=temp1, temp3 // kernel stack pointer on MCA/INIT stack add r13=temp1, r3 // set current to start of MCA/INIT stack + add r20=temp1, r3 // physical start of MCA/INIT stack ;; ld8 r1=[temp4] // OS GP from SAL OS state ;; @@ -991,7 +1037,35 @@ ia64_set_kernel_registers: ;; mov IA64_KR(CURRENT)=r13 - // FIXME: do I need to wire IA64_KR_CURRENT_STACK and IA64_TR_CURRENT_STACK? + /* Wire IA64_TR_CURRENT_STACK to the MCA/INIT handler stack. To avoid + * any dependencies on the algorithm in ia64_switch_to(), just purge + * any existing CURRENT_STACK mapping and insert the new one. + */ + + mov r16=IA64_KR(CURRENT_STACK) // physical granule mapped by IA64_TR_CURRENT_STACK + ;; + shl r16=r16,IA64_GRANULE_SHIFT + ;; + dep r16=-1,r16,61,3 // virtual granule + mov r18=IA64_GRANULE_SHIFT<<2 // for cr.itir.ps + ;; + ptr.d r16,r18 + ;; + srlz.d + + shr.u r16=r20,IA64_GRANULE_SHIFT // r20 = physical start of MCA/INIT stack + movl r21=PAGE_KERNEL // page properties + ;; + mov IA64_KR(CURRENT_STACK)=r16 + or r21=r20,r21 // construct PA | page properties + ;; + mov cr.itir=r18 + mov cr.ifa=r13 + mov r20=IA64_TR_CURRENT_STACK + ;; + itr.d dtr[r20]=r21 + ;; + srlz.d br.sptk b0 -- cgit v1.1 From 4881e2cd25d2d9cf9fca263caff3a0ce732d7d6b Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 20 Sep 2005 16:34:41 +0900 Subject: [IA64] MCA recovery verify pfn_valid Verify the pfn is valid before calling pfn_to_page(), and cut isolation message if nothing was done. Signed-off-by: Hidetoshi Seto Acked-by: Russ Anderson Signed-off-by: Tony Luck --- arch/ia64/kernel/mca_drv.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index 80f83d6..f081c60 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c @@ -56,8 +56,9 @@ static struct page *page_isolate[MAX_PAGE_ISOLATE]; static int num_page_isolate = 0; typedef enum { - ISOLATE_NG = 0, - ISOLATE_OK = 1 + ISOLATE_NG, + ISOLATE_OK, + ISOLATE_NONE } isolate_status_t; /* @@ -74,7 +75,7 @@ static struct { * @paddr: poisoned memory location * * Return value: - * ISOLATE_OK / ISOLATE_NG + * one of isolate_status_t, ISOLATE_OK/NG/NONE. 
*/ static isolate_status_t @@ -85,7 +86,10 @@ mca_page_isolate(unsigned long paddr) /* whether physical address is valid or not */ if (!ia64_phys_addr_valid(paddr)) - return ISOLATE_NG; + return ISOLATE_NONE; + + if (!pfn_valid(paddr)) + return ISOLATE_NONE; /* convert physical address to physical page number */ p = pfn_to_page(paddr>>PAGE_SHIFT); @@ -122,10 +126,15 @@ mca_handler_bh(unsigned long paddr) current->pid, current->comm); spin_lock(&mca_bh_lock); - if (mca_page_isolate(paddr) == ISOLATE_OK) { + switch (mca_page_isolate(paddr)) { + case ISOLATE_OK: printk(KERN_DEBUG "Page isolation: ( %lx ) success.\n", paddr); - } else { + break; + case ISOLATE_NG: printk(KERN_DEBUG "Page isolation: ( %lx ) failure.\n", paddr); + break; + default: + break; } spin_unlock(&mca_bh_lock); -- cgit v1.1 From d719948e623622cf9fc8016f7ec63422d929eb3b Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 28 Sep 2005 16:09:46 -0700 Subject: [IA64] end of kernel 'data' is at _end, not _edata /proc/iomem describes a block of memory as "Kernel data", but the end address is derived from "_edata". The kernel actually has many other sections beyond _edata. Get the real end address from _end. Acked-by: Khalid Aziz Signed-off-by: Tony Luck --- arch/ia64/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 83b37c4..3e9b797 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -89,7 +89,7 @@ static struct resource code_resource = { }; extern void efi_initialize_iomem_resources(struct resource *, struct resource *); -extern char _text[], _edata[], _etext[]; +extern char _text[], _end[], _etext[]; unsigned long ia64_max_cacheline_size; unsigned long ia64_iobase; /* virtual address for I/O accesses */ @@ -192,7 +192,7 @@ static int __init register_memory(void) code_resource.start = ia64_tpa(_text); code_resource.end = ia64_tpa(_etext) - 1; data_resource.start = ia64_tpa(_etext); - data_resource.end = ia64_tpa(_edata) - 1; + data_resource.end = ia64_tpa(_end) - 1; efi_initialize_iomem_resources(&code_resource, &data_resource); return 0; -- cgit v1.1 From 76e677e25dd3d8af77d0b3810eacaacaf2f93f2f Mon Sep 17 00:00:00 2001 From: Bryan Sutula Date: Wed, 5 Oct 2005 11:02:06 -0600 Subject: [IA64] Avoid kernel hang during CMC interrupt storm I've noticed a kernel hang during a storm of CMC interrupts, which was tracked down to the continual execution of the interrupt handler. There's code in the CMC handler that's supposed to disable CMC interrupts and switch to polling mode when it sees a bunch of CMCs. Because disabling CMCs across all CPUs isn't safe in interrupt context, the disable is done with a schedule_work(). But with continual CMC interrupts, the schedule_work() never gets executed. The following patch immediately disables CMC interrupts for the current CPU. This then allows (at least) one CPU to ignore CMC interrupts, execute the schedule_work() code, and disable CMC interrupts on the rest of the CPUs. 
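For illustration only (hypothetical names and a generic IRQ, using the 2.6.14-era workqueue/interrupt interfaces; this is not the actual arch/ia64/kernel/mca.c code): the essential ordering of the fix is to silence the storming source immediately from the handler itself, then let the deferred work do the system-wide disable from process context.

#include <linux/interrupt.h>
#include <linux/workqueue.h>
#include <asm/atomic.h>

#define MY_STORM_THRESHOLD	16

static void my_disable_everywhere(void *unused)
{
	/* in the real code this switches to polling mode and disables the
	 * CMC vector on every CPU; elided in this sketch */
}

static DECLARE_WORK(my_disable_work, my_disable_everywhere, NULL);
static atomic_t my_event_count = ATOMIC_INIT(0);

static irqreturn_t my_storm_handler(int irq, void *dev_id, struct pt_regs *regs)
{
	if (atomic_inc_return(&my_event_count) == MY_STORM_THRESHOLD) {
		/* immediate disable, safe from interrupt context (the real fix
		 * disables the per-CPU CMC vector on just this processor) */
		disable_irq_nosync(irq);
		/* the system-wide disable runs later, in process context */
		schedule_work(&my_disable_work);
	}
	return IRQ_HANDLED;
}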
Acked-by: Keith Owens Signed-off-by: Bryan Sutula Signed-off-by: Tony Luck --- arch/ia64/kernel/mca.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 6dc726a..d0a5106 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1016,6 +1016,11 @@ ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs) cmc_polling_enabled = 1; spin_unlock(&cmc_history_lock); + /* If we're being hit with CMC interrupts, we won't + * ever execute the schedule_work() below. Need to + * disable CMC interrupts on this processor now. + */ + ia64_mca_cmc_vector_disable(NULL); schedule_work(&cmc_disable_work); /* -- cgit v1.1 From ce6e71ad4866dad366be135b024b776c00ec9ec8 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Tue, 4 Oct 2005 16:35:31 -0700 Subject: [IA64] fix siblings field value in /proc/cpuinfo Fix the "siblings" field value in /proc/cpuinfo so that it now shows the number of siblings as seen by OS, instead of what is available from hardware perspective. Signed-off-by: Suresh Siddha Signed-off-by: Tony Luck --- arch/ia64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 1f5c26d..0ca6ef1 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -526,7 +526,7 @@ show_cpuinfo (struct seq_file *m, void *v) c->itc_freq / 1000000, c->itc_freq % 1000000, lpj*HZ/500000, (lpj*HZ/5000) % 100); #ifdef CONFIG_SMP - seq_printf(m, "siblings : %u\n", c->num_log); + seq_printf(m, "siblings : %u\n", cpus_weight(cpu_core_map[cpunum])); if (c->threads_per_core > 1 || c->cores_per_socket > 1) seq_printf(m, "physical id: %u\n" -- cgit v1.1 From 9c184a073bfd650cc791956d6ca79725bb682716 Mon Sep 17 00:00:00 2001 From: "H. J. Lu" Date: Fri, 7 Oct 2005 11:01:19 -0700 Subject: [IA64] Fix 2.6 kernel for the new ia64 assembler The new ia64 assembler uses slot 1 for the offset of a long (2-slot) instruction and the old assembler uses slot 2. The 2.6 kernel assumes slot 2 and won't boot when the new assembler is used: http://sources.redhat.com/bugzilla/show_bug.cgi?id=1433 This patch will work with either slot 1 or 2. Patch provided by H.J. Lu Signed-off-by: Tony Luck --- arch/ia64/kernel/patch.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c index 367804a..6a4ac7d 100644 --- a/arch/ia64/kernel/patch.c +++ b/arch/ia64/kernel/patch.c @@ -64,22 +64,30 @@ ia64_patch (u64 insn_addr, u64 mask, u64 val) void ia64_patch_imm64 (u64 insn_addr, u64 val) { - ia64_patch(insn_addr, + /* The assembler may generate offset pointing to either slot 1 + or slot 2 for a long (2-slot) instruction, occupying slots 1 + and 2. 
*/ + insn_addr &= -16UL; + ia64_patch(insn_addr + 2, 0x01fffefe000UL, ( ((val & 0x8000000000000000UL) >> 27) /* bit 63 -> 36 */ | ((val & 0x0000000000200000UL) << 0) /* bit 21 -> 21 */ | ((val & 0x00000000001f0000UL) << 6) /* bit 16 -> 22 */ | ((val & 0x000000000000ff80UL) << 20) /* bit 7 -> 27 */ | ((val & 0x000000000000007fUL) << 13) /* bit 0 -> 13 */)); - ia64_patch(insn_addr - 1, 0x1ffffffffffUL, val >> 22); + ia64_patch(insn_addr + 1, 0x1ffffffffffUL, val >> 22); } void ia64_patch_imm60 (u64 insn_addr, u64 val) { - ia64_patch(insn_addr, + /* The assembler may generate offset pointing to either slot 1 + or slot 2 for a long (2-slot) instruction, occupying slots 1 + and 2. */ + insn_addr &= -16UL; + ia64_patch(insn_addr + 2, 0x011ffffe000UL, ( ((val & 0x0800000000000000UL) >> 23) /* bit 59 -> 36 */ | ((val & 0x00000000000fffffUL) << 13) /* bit 0 -> 13 */)); - ia64_patch(insn_addr - 1, 0x1fffffffffcUL, val >> 18); + ia64_patch(insn_addr + 1, 0x1fffffffffcUL, val >> 18); } /* -- cgit v1.1 From dc565b525d4b7091a3abb6616d210c8a896a11d7 Mon Sep 17 00:00:00 2001 From: "hawkes@sgi.com" Date: Mon, 10 Oct 2005 08:43:26 -0700 Subject: [IA64] wider use of for_each_cpu_mask() in arch/ia64 In arch/ia64 change the explicit use of for-loops and NR_CPUS into the general for_each_cpu() or for_each_online_cpu() constructs, as appropriate. This widens the scope of potential future optimizations of the general constructs, as well as takes advantage of the existing optimizations of first_cpu() and next_cpu(). Signed-off-by: John Hawkes Signed-off-by: Tony Luck --- arch/ia64/kernel/irq.c | 12 ++++++------ arch/ia64/kernel/module.c | 6 +++--- arch/ia64/kernel/smp.c | 10 +++++----- arch/ia64/kernel/smpboot.c | 6 +++--- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index 205d980..d33244c 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -57,9 +57,9 @@ int show_interrupts(struct seq_file *p, void *v) if (i == 0) { seq_printf(p, " "); - for (j=0; jtypename); seq_printf(p, " %s", action->name); diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index f1aca7c..7a2f0a7 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -947,8 +947,8 @@ void percpu_modcopy (void *pcpudst, const void *src, unsigned long size) { unsigned int i; - for (i = 0; i < NR_CPUS; i++) - if (cpu_possible(i)) - memcpy(pcpudst + __per_cpu_offset[i], src, size); + for_each_cpu(i) { + memcpy(pcpudst + __per_cpu_offset[i], src, size); + } } #endif /* CONFIG_SMP */ diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 0166a98..657ac99 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -185,8 +185,8 @@ send_IPI_allbutself (int op) { unsigned int i; - for (i = 0; i < NR_CPUS; i++) { - if (cpu_online(i) && i != smp_processor_id()) + for_each_online_cpu(i) { + if (i != smp_processor_id()) send_IPI_single(i, op); } } @@ -199,9 +199,9 @@ send_IPI_all (int op) { int i; - for (i = 0; i < NR_CPUS; i++) - if (cpu_online(i)) - send_IPI_single(i, op); + for_each_online_cpu(i) { + send_IPI_single(i, op); + } } /* diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 7d72c0d..400a489 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -694,9 +694,9 @@ smp_cpus_done (unsigned int dummy) * Allow the user to impress friends. 
*/ - for (cpu = 0; cpu < NR_CPUS; cpu++) - if (cpu_online(cpu)) - bogosum += cpu_data(cpu)->loops_per_jiffy; + for_each_online_cpu(cpu) { + bogosum += cpu_data(cpu)->loops_per_jiffy; + } printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", (int)num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); -- cgit v1.1 From ddf6d0a00cbb4ad6d4fb4200cc911bb9389174c1 Mon Sep 17 00:00:00 2001 From: "hawkes@sgi.com" Date: Thu, 13 Oct 2005 12:01:18 -0700 Subject: [IA64] another place to use for_each_cpu_mask() in arch/ia64 In arch/ia64 change the explicit use of a for-loop using NR_CPUS into the general for_each_online_cpu() construct. This widens the scope of potential future optimizations of the general constructs, as well as takes advantage of the existing optimizations of first_cpu() and next_cpu(), which is advantageous when the true CPU count is much smaller than NR_CPUS. Signed-off-by: John Hawkes Signed-off-by: Tony Luck --- arch/ia64/kernel/mca.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index d0a5106..52c47da 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -508,9 +508,7 @@ ia64_mca_wakeup_all(void) int cpu; /* Clear the Rendez checkin flag for all cpus */ - for(cpu = 0; cpu < NR_CPUS; cpu++) { - if (!cpu_online(cpu)) - continue; + for_each_online_cpu(cpu) { if (ia64_mc_info.imi_rendez_checkin[cpu] == IA64_MCA_RENDEZ_CHECKIN_DONE) ia64_mca_wakeup(cpu); } -- cgit v1.1 From 4ac0068f44f192f2de95a7bb36df3e19767a45fb Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Thu, 27 Oct 2005 10:29:08 -0500 Subject: [IA64] ptrace - find memory sharers on children list In arch/ia64/kernel/ptrace.c there is a test for a peek or poke of a register image (in register backing storage). The test can be unnecessarily long (and occurs while holding the tasklist_lock). Especially long on a large system with thousands of active tasks. The ptrace caller (presumably a debugger) specifies the pid of its target and an address to peek or poke. But the debugger could be attached to several tasks. The idea of find_thread_for_addr() is to find whether the target address is in the RBS for any of those tasks. Currently it searches the thread-list of the target pid. If that search does not find a match, and the shared mm-struct's user count indicates that there are other tasks sharing this address space (a rare occurrence), a search is made of all the tasks in the system. Another approach can drastically shorten this procedure. It depends upon the fact that in order to peek or poke from/to any task, the debugger must first attach to that task. And when it does, the attached task is made a child of the debugger (is chained to its children list). Therefore we can search just the debugger's children list. 
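As a side note, a minimal sketch of the children-list traversal this relies on (the actual ptrace.c change below uses the list_for_each_safe()/list_entry() form; names here are illustrative). Per the description above, an attached task is chained into the debugger's ->children list through task_struct.sibling, and the caller is assumed to hold tasklist_lock.

#include <linux/list.h>
#include <linux/sched.h>

static struct task_struct *find_child_sharing_mm(struct mm_struct *mm)
{
	struct task_struct *p;

	list_for_each_entry(p, &current->children, sibling) {
		if (p->mm == mm)
			return p;	/* attached child sharing this address space */
	}
	return NULL;
}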
Signed-off-by: Cliff Wickman Signed-off-by: Tony Luck --- arch/ia64/kernel/ptrace.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index bbb8bc7..9a9c1bd 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -589,6 +589,7 @@ find_thread_for_addr (struct task_struct *child, unsigned long addr) { struct task_struct *g, *p; struct mm_struct *mm; + struct list_head *this, *next; int mm_users; if (!(mm = get_task_mm(child))) @@ -600,28 +601,21 @@ find_thread_for_addr (struct task_struct *child, unsigned long addr) goto out; /* not multi-threaded */ /* - * First, traverse the child's thread-list. Good for scalability with - * NPTL-threads. + * Traverse the current process' children list. Every task that + * one attaches to becomes a child. And it is only attached children + * of the debugger that are of interest (ptrace_check_attach checks + * for this). */ - p = child; - do { - if (thread_matches(p, addr)) { - child = p; - goto out; - } - if (mm_users-- <= 1) - goto out; - } while ((p = next_thread(p)) != child); - - do_each_thread(g, p) { - if (child->mm != mm) + list_for_each_safe(this, next, ¤t->children) { + p = list_entry(this, struct task_struct, sibling); + if (p->mm != mm) continue; - if (thread_matches(p, addr)) { child = p; goto out; } - } while_each_thread(g, p); + } + out: mmput(mm); return child; -- cgit v1.1 From 0e1f60609258e18ec0a0477c646101212822d387 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Oct 2005 15:52:13 -0700 Subject: [IA64] fix warning unused variable `g' 4ac0068f44f192f2de95a7bb36df3e19767a45fb forgot to delete the declaration of this variable which is no longer used. Signed-off-by: Tony Luck --- arch/ia64/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 9a9c1bd..4b19d04 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -587,7 +587,7 @@ thread_matches (struct task_struct *thread, unsigned long addr) static struct task_struct * find_thread_for_addr (struct task_struct *child, unsigned long addr) { - struct task_struct *g, *p; + struct task_struct *p; struct mm_struct *mm; struct list_head *this, *next; int mm_users; -- cgit v1.1 From ab50b8ed818016cfecd747d6d4bb9139986bc029 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:56 -0700 Subject: [PATCH] mm: vm_stat_account unshackled The original vm_stat_account has fallen into disuse, with only one user, and only one user of vm_stat_unaccount. It's easier to keep track if we convert them all to __vm_stat_account, then free it from its __shackles. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index d71731e..f7dfc10 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2352,7 +2352,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon insert_vm_struct(mm, vma); mm->total_vm += size >> PAGE_SHIFT; - vm_stat_account(vma); + vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, + vma_pages(vma)); up_write(&task->mm->mmap_sem); /* -- cgit v1.1 From ecea8d19c9f0ebd62ddaa07fc919ff4e4b820d99 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 30 Oct 2005 15:03:00 -0800 Subject: [PATCH] jiffies_64 cleanup Define jiffies_64 in kernel/timer.c rather than having 24 duplicated defines in each architecture. Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/time.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 8b8a5a4..5b7e736 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -32,10 +32,6 @@ extern unsigned long wall_jiffies; -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - #define TIME_KEEPER_ID 0 /* smp_processor_id() of time-keeper */ #ifdef CONFIG_IA64_DEBUG_IRQ -- cgit v1.1 From 4e57b6817880946a3a78d5d8cad1ace363f7e449 Mon Sep 17 00:00:00 2001 From: Tim Schmielau Date: Sun, 30 Oct 2005 15:03:48 -0800 Subject: [PATCH] fix missing includes I recently picked up my older work to remove unnecessary #includes of sched.h, starting from a patch by Dave Jones to not include sched.h from module.h. This reduces the number of indirect includes of sched.h by ~300. Another ~400 pointless direct includes can be removed after this disentangling (patch to follow later). However, quite a few indirect includes need to be fixed up for this. In order to feed the patches through -mm with as little disturbance as possible, I've split out the fixes I accumulated up to now (complete for i386 and x86_64, more archs to follow later) and post them before the real patch. This way this large part of the patch is kept simple with only adding #includes, and all hunks are independent of each other. So if any hunk rejects or gets in the way of other patches, just drop it. My scripts will pick it up again in the next round. Signed-off-by: Tim Schmielau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/cyclone.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c index 768c7e4..6ade379 100644 --- a/arch/ia64/kernel/cyclone.c +++ b/arch/ia64/kernel/cyclone.c @@ -2,6 +2,7 @@ #include #include #include +#include #include /* IBM Summit (EXA) Cyclone counter code*/ -- cgit v1.1 From dcc17d1baef3721d1574e5b2f4f2d4607514bcff Mon Sep 17 00:00:00 2001 From: Peter Keilty Date: Mon, 31 Oct 2005 16:44:47 -0500 Subject: [IA64] Use bitmaps for efficient context allocation/free Corrects the very inefficent method of finding free context_ids in get_mmu_context(). Instead of walking the task_list of all processes, 2 bitmaps are used to efficently store and lookup state, inuse and needs flushing. 
The entire rid address space is now used before calling wrap_mmu_context and global tlb flushing. Special thanks to Ken and Rohit for their review and modifications in using a bit flushmap. Signed-off-by: Peter Keilty Signed-off-by: Tony Luck --- arch/ia64/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index fc56ca2..c9388a9 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -454,6 +454,7 @@ setup_arch (char **cmdline_p) #endif cpu_init(); /* initialize the bootstrap CPU */ + mmu_context_init(); /* initialize context_id bitmap */ #ifdef CONFIG_ACPI acpi_boot_init(); -- cgit v1.1 From e1531b4218a7ccfc1b2234b87105201e5ebe1bbf Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Mon, 7 Nov 2005 00:57:54 -0800 Subject: [PATCH] ia64: re-implement dma_get_cache_alignment to avoid EXPORT_SYMBOL The current ia64 implementation of dma_get_cache_alignment does not work for modules because it relies on a symbol which is not exported. Direct access to a global is a little ugly anyway, so this patch re-implements dma_get_cache_alignment in a manner similar to what is currently used for x86_64. Signed-off-by: John W. Linville Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/setup.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index fc56ca2..3af6de3 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -92,6 +92,13 @@ extern void efi_initialize_iomem_resources(struct resource *, extern char _text[], _end[], _etext[]; unsigned long ia64_max_cacheline_size; + +int dma_get_cache_alignment(void) +{ + return ia64_max_cacheline_size; +} +EXPORT_SYMBOL(dma_get_cache_alignment); + unsigned long ia64_iobase; /* virtual address for I/O accesses */ EXPORT_SYMBOL(ia64_iobase); struct io_space io_space[MAX_IO_SPACES]; -- cgit v1.1 From 66ff2d0691e00e1e7bfdf398a970310c9a0fe671 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Mon, 7 Nov 2005 01:00:07 -0800 Subject: [PATCH] Kprobes: rearrange preempt_disable/enable() calls The following set of patches are aimed at improving kprobes scalability. We currently serialize kprobe registration, unregistration and handler execution using a single spinlock - kprobe_lock. With these changes, kprobe handlers can run without any locks held. It also allows for simultaneous kprobe handler executions on different processors as we now track kprobe execution on a per processor basis. It is now necessary that the handlers be re-entrant since handlers can run concurrently on multiple processors. All changes have been tested on i386, ia64, ppc64 and x86_64, while sparc64 has been compile tested only. The patches can be viewed as 3 logical chunks: patch 1: Reorder preempt_(dis/en)able calls patches 2-7: Introduce per_cpu data areas to track kprobe execution patches 8-9: Use RCU to synchronize kprobe (un)registration and handler execution. Thanks to Maneesh Soni, James Keniston and Anil Keshavamurthy for their review and suggestions. Thanks again to Anil, Hien Nguyen and Kevin Stafford for testing the patches. This patch: Reorder preempt_disable/enable() calls in arch kprobes files in preparation to introduce locking changes. No functional changes introduced by this patch. 
Signed-off-by: Ananth N Mavinakayahanalli Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 471086b..1e80ec8 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -395,7 +395,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) /* * By returning a non-zero value, we are telling * kprobe_handler() that we have handled unlocking - * and re-enabling preemption. + * and re-enabling preemption */ return 1; } @@ -607,8 +607,6 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) struct pt_regs *regs = args->regs; kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs); - preempt_disable(); - /* Handle recursion cases */ if (kprobe_running()) { p = get_kprobe(addr); @@ -665,6 +663,11 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) goto no_kprobe; } + /* + * This preempt_disable() matches the preempt_enable_no_resched() + * in post_kprobes_handler() + */ + preempt_disable(); kprobe_status = KPROBE_HIT_ACTIVE; set_current_kprobe(p); @@ -682,7 +685,6 @@ ss_probe: return 1; no_kprobe: - preempt_enable_no_resched(); return ret; } @@ -733,22 +735,26 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + preempt_disable(); switch(val) { case DIE_BREAK: if (pre_kprobes_handler(args)) - return NOTIFY_STOP; + ret = NOTIFY_STOP; break; case DIE_SS: if (post_kprobes_handler(args->regs)) - return NOTIFY_STOP; + ret = NOTIFY_STOP; break; case DIE_PAGE_FAULT: if (kprobes_fault_handler(args->regs, args->trapnr)) - return NOTIFY_STOP; + ret = NOTIFY_STOP; default: break; } - return NOTIFY_DONE; + preempt_enable(); + return ret; } int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -- cgit v1.1 From 8a5c4dc5e5d72b7802f5647082ccf3861a94f013 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Mon, 7 Nov 2005 01:00:09 -0800 Subject: [PATCH] Kprobes: Track kprobe on a per_cpu basis - ia64 changes IA64 changes to track kprobe execution on a per-cpu basis. We now track the kprobe state machine independently on each cpu using an arch specific kprobe control block. 
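A generic sketch of the per-CPU pattern described above (illustrative structure and names, not the kprobes control block itself): one instance of the state exists per processor, and it is accessed with preemption disabled so the task cannot migrate to another CPU mid-update.

#include <linux/percpu.h>

struct my_ctlblk {
	unsigned long status;
};

static DEFINE_PER_CPU(struct my_ctlblk, my_ctlblk);

static void my_update_status(unsigned long status)
{
	struct my_ctlblk *cb = &get_cpu_var(my_ctlblk);	/* disables preemption */

	cb->status = status;
	put_cpu_var(my_ctlblk);				/* re-enables preemption */
}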
Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 83 +++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 38 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 1e80ec8..17e70b1 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -38,13 +38,8 @@ extern void jprobe_inst_return(void); -/* kprobe_status settings */ -#define KPROBE_HIT_ACTIVE 0x00000001 -#define KPROBE_HIT_SS 0x00000002 - -static struct kprobe *current_kprobe, *kprobe_prev; -static unsigned long kprobe_status, kprobe_status_prev; -static struct pt_regs jprobe_saved_regs; +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); enum instruction_type {A, I, M, F, B, L, X, u}; static enum instruction_type bundle_encoding[32][3] = { @@ -313,21 +308,22 @@ static int __kprobes valid_kprobe_addr(int template, int slot, return 0; } -static inline void save_previous_kprobe(void) +static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) { - kprobe_prev = current_kprobe; - kprobe_status_prev = kprobe_status; + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; } -static inline void restore_previous_kprobe(void) +static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb) { - current_kprobe = kprobe_prev; - kprobe_status = kprobe_status_prev; + __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + kcb->kprobe_status = kcb->prev_kprobe.status; } -static inline void set_current_kprobe(struct kprobe *p) +static inline void set_current_kprobe(struct kprobe *p, + struct kprobe_ctlblk *kcb) { - current_kprobe = p; + __get_cpu_var(current_kprobe) = p; } static void kretprobe_trampoline(void) @@ -389,6 +385,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); regs->cr_iip = orig_ret_address; + reset_current_kprobe(); unlock_kprobes(); preempt_enable_no_resched(); @@ -606,12 +603,13 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) int ret = 0; struct pt_regs *regs = args->regs; kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); /* Handle recursion cases */ if (kprobe_running()) { p = get_kprobe(addr); if (p) { - if ( (kprobe_status == KPROBE_HIT_SS) && + if ((kcb->kprobe_status == KPROBE_HIT_SS) && (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) { ia64_psr(regs)->ss = 0; unlock_kprobes(); @@ -623,17 +621,17 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) * just single step on the instruction of the new probe * without calling any user handlers. 
*/ - save_previous_kprobe(); - set_current_kprobe(p); + save_previous_kprobe(kcb); + set_current_kprobe(p, kcb); p->nmissed++; prepare_ss(p, regs); - kprobe_status = KPROBE_REENTER; + kcb->kprobe_status = KPROBE_REENTER; return 1; } else if (args->err == __IA64_BREAK_JPROBE) { /* * jprobe instrumented function just completed */ - p = current_kprobe; + p = __get_cpu_var(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { goto ss_probe; } @@ -668,8 +666,8 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) * in post_kprobes_handler() */ preempt_disable(); - kprobe_status = KPROBE_HIT_ACTIVE; - set_current_kprobe(p); + set_current_kprobe(p, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (p->pre_handler && p->pre_handler(p, regs)) /* @@ -681,7 +679,7 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) ss_probe: prepare_ss(p, regs); - kprobe_status = KPROBE_HIT_SS; + kcb->kprobe_status = KPROBE_HIT_SS; return 1; no_kprobe: @@ -690,22 +688,25 @@ no_kprobe: static int __kprobes post_kprobes_handler(struct pt_regs *regs) { - if (!kprobe_running()) + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) return 0; - if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { - kprobe_status = KPROBE_HIT_SSDONE; - current_kprobe->post_handler(current_kprobe, regs, 0); + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); } - resume_execution(current_kprobe, regs); + resume_execution(cur, regs); /*Restore back the original saved kprobes variables and continue. */ - if (kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(); + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); goto out; } - + reset_current_kprobe(); unlock_kprobes(); out: @@ -715,15 +716,18 @@ out: static int __kprobes kprobes_fault_handler(struct pt_regs *regs, int trapnr) { - if (!kprobe_running()) + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) return 0; - if (current_kprobe->fault_handler && - current_kprobe->fault_handler(current_kprobe, regs, trapnr)) + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) return 1; - if (kprobe_status & KPROBE_HIT_SS) { - resume_execution(current_kprobe, regs); + if (kcb->kprobe_status & KPROBE_HIT_SS) { + resume_execution(cur, regs); + reset_current_kprobe(); unlock_kprobes(); preempt_enable_no_resched(); } @@ -761,9 +765,10 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct jprobe *jp = container_of(p, struct jprobe, kp); unsigned long addr = ((struct fnptr *)(jp->entry))->ip; + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); /* save architectural state */ - jprobe_saved_regs = *regs; + kcb->jprobe_saved_regs = *regs; /* after rfi, execute the jprobe instrumented function */ regs->cr_iip = addr & ~0xFULL; @@ -781,7 +786,9 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { - *regs = jprobe_saved_regs; + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + *regs = kcb->jprobe_saved_regs; return 1; } -- cgit v1.1 From 991a51d83a3d9bebfafdd1e692cf310899d60791 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Mon, 7 Nov 2005 01:00:14 -0800 Subject: [PATCH] Kprobes: Use RCU for (un)register synchronization - arch changes Changes to the arch 
kprobes infrastructure to take advantage of the locking changes introduced by usage of RCU for synchronization. All handlers are now run without any locks held, so they have to be re-entrant or provide their own synchronization. Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 17e70b1..fddbac3 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -343,10 +342,11 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) struct kretprobe_instance *ri = NULL; struct hlist_head *head; struct hlist_node *node, *tmp; - unsigned long orig_ret_address = 0; + unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = ((struct fnptr *)kretprobe_trampoline)->ip; + spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(current); /* @@ -386,7 +386,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) regs->cr_iip = orig_ret_address; reset_current_kprobe(); - unlock_kprobes(); + spin_unlock_irqrestore(&kretprobe_lock, flags); preempt_enable_no_resched(); /* @@ -397,6 +397,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) return 1; } +/* Called with kretprobe_lock held */ void __kprobes arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { @@ -612,7 +613,6 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) if ((kcb->kprobe_status == KPROBE_HIT_SS) && (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) { ia64_psr(regs)->ss = 0; - unlock_kprobes(); goto no_kprobe; } /* We have reentered the pre_kprobe_handler(), since @@ -641,10 +641,8 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) } } - lock_kprobes(); p = get_kprobe(addr); if (!p) { - unlock_kprobes(); if (!is_ia64_break_inst(regs)) { /* * The breakpoint instruction was removed right @@ -707,7 +705,6 @@ static int __kprobes post_kprobes_handler(struct pt_regs *regs) goto out; } reset_current_kprobe(); - unlock_kprobes(); out: preempt_enable_no_resched(); @@ -728,7 +725,6 @@ static int __kprobes kprobes_fault_handler(struct pt_regs *regs, int trapnr) if (kcb->kprobe_status & KPROBE_HIT_SS) { resume_execution(cur, regs); reset_current_kprobe(); - unlock_kprobes(); preempt_enable_no_resched(); } @@ -741,7 +737,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, struct die_args *args = (struct die_args *)data; int ret = NOTIFY_DONE; - preempt_disable(); + rcu_read_lock(); switch(val) { case DIE_BREAK: if (pre_kprobes_handler(args)) @@ -757,7 +753,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, default: break; } - preempt_enable(); + rcu_read_unlock(); return ret; } -- cgit v1.1 From d217d5450f11d8c907c0458d175b0dc999b4d06d Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Mon, 7 Nov 2005 01:00:14 -0800 Subject: [PATCH] Kprobes: preempt_disable/enable() simplification Reorganize the preempt_disable/enable calls to eliminate the extra preempt depth. Changes based on Paul McKenney's review suggestions for the kprobes RCU changeset. 
Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index fddbac3..96736a1 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -389,11 +389,11 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) spin_unlock_irqrestore(&kretprobe_lock, flags); preempt_enable_no_resched(); - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we have handled unlocking - * and re-enabling preemption - */ + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) + */ return 1; } @@ -604,7 +604,14 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) int ret = 0; struct pt_regs *regs = args->regs; kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + struct kprobe_ctlblk *kcb; + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing + */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); /* Handle recursion cases */ if (kprobe_running()) { @@ -659,11 +666,6 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) goto no_kprobe; } - /* - * This preempt_disable() matches the preempt_enable_no_resched() - * in post_kprobes_handler() - */ - preempt_disable(); set_current_kprobe(p, kcb); kcb->kprobe_status = KPROBE_HIT_ACTIVE; @@ -681,6 +683,7 @@ ss_probe: return 1; no_kprobe: + preempt_enable_no_resched(); return ret; } @@ -716,9 +719,6 @@ static int __kprobes kprobes_fault_handler(struct pt_regs *regs, int trapnr) struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - if (!cur) - return 0; - if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) return 1; @@ -737,7 +737,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, struct die_args *args = (struct die_args *)data; int ret = NOTIFY_DONE; - rcu_read_lock(); switch(val) { case DIE_BREAK: if (pre_kprobes_handler(args)) @@ -748,12 +747,15 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_PAGE_FAULT: - if (kprobes_fault_handler(args->regs, args->trapnr)) + /* kprobe_running() needs smp_processor_id() */ + preempt_disable(); + if (kprobe_running() && + kprobes_fault_handler(args->regs, args->trapnr)) ret = NOTIFY_STOP; + preempt_enable(); default: break; } - rcu_read_unlock(); return ret; } @@ -785,6 +787,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); *regs = kcb->jprobe_saved_regs; + preempt_enable_no_resched(); return 1; } -- cgit v1.1 From b2325fe1b7e5654fac9e9419423aa2c58a3dbd83 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 7 Nov 2005 01:01:35 -0800 Subject: [PATCH] kfree cleanup: arch This is the arch/ part of the big kfree cleanup patch. Remove pointless checks for NULL prior to calling kfree() in arch/. 
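[Editorial note] The change is mechanical: since kfree(NULL) is defined to be a no-op, guards like the one sketched below (taken from the perfmon.c hunk that follows) can simply be dropped.

	/* Before: redundant NULL check -- kfree() already tolerates NULL. */
	if (args_k)
		kfree(args_k);

	/* After: */
	kfree(args_k);
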
Signed-off-by: Jesper Juhl Acked-by: Grant Grundler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index f7dfc10..410d480 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -4940,7 +4940,7 @@ abort_locked: if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT; error_args: - if (args_k) kfree(args_k); + kfree(args_k); DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret)); -- cgit v1.1 From 9138d581b0ef855c0314c41c14852a7231b9941c Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Mon, 7 Nov 2005 11:27:13 -0800 Subject: [IA64] Extend notify_die() hooks for IA64 notify_die() added for MCA_{MONARCH,SLAVE,RENDEZVOUS}_{ENTER,PROCESS,LEAVE} and INIT_{MONARCH,SLAVE}_{ENTER,PROCESS,LEAVE}. We need multiple notification points for these events because they can take many seconds to run which has nasty effects on the behaviour of the rest of the system. DIE_SS replaced by a generic DIE_FAULT which checks the vector number, to allow interception of faults other than SS. DIE_MACHINE_{HALT,RESTART} added to allow last minute close down processing, especially when the halt/restart routines are called from error handlers. DIE_OOPS added. The check for kprobe's break numbers has been moved from traps.c to kprobes.c, allowing DIE_BREAK to be used for any additional break numbers, i.e. it is no longer kprobes specific. Hooks for kernel debuggers and kernel dumpers added, ENTER and LEAVE. Both of these disable the system for long periods which impact on watchdogs and heartbeat systems in general. More patches to come that use these events to reset watchdogs and heartbeats. unregister_die_notifier() added and both routines exported. Requested by Dean Nelson. Lock removed from {un,}register_die_notifier. notifier_chain_register() already takes a lock. Also the generic notifier chain locking is being reworked to distinguish between callbacks that can block and those that cannot, the lock in {un,}register_die_notifier would interfere with that change. http://marc.theaimsgroup.com/?l=linux-kernel&m=113018709002036&w=2 Leading white space removed from arch/ia64/kernel/kprobes.c. Typo in mca.c in original version of this patch found & fixed by Dean Nelson. 
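[Editorial note] For watchdog or debugger code that wants to consume the new events, registration goes through the now-exported register_die_notifier(). The sketch below is an illustrative module-side consumer, not part of the patch; the callback name, the priority value, the chosen events and the header names are assumptions for the example.

	#include <linux/module.h>
	#include <linux/notifier.h>
	#include <asm/kdebug.h>		/* DIE_* events, register_die_notifier() (assumed location) */

	/* Hypothetical callback: quiesce a heartbeat while MCA/INIT processing runs. */
	static int example_die_event(struct notifier_block *self, unsigned long val,
				     void *data)
	{
		switch (val) {
		case DIE_MCA_MONARCH_ENTER:
		case DIE_INIT_MONARCH_ENTER:
			/* long-running event starting: pause watchdogs here */
			break;
		case DIE_MCA_MONARCH_LEAVE:
		case DIE_INIT_MONARCH_LEAVE:
			/* event finished: re-arm watchdogs here */
			break;
		}
		/* Returning NOTIFY_STOP from these hooks makes the MCA/INIT handler
		 * spin instead of returning to SAL, so normally return NOTIFY_DONE. */
		return NOTIFY_DONE;
	}

	static struct notifier_block example_die_nb = {
		.notifier_call	= example_die_event,
		.priority	= 1,	/* above the default INIT monarch printer (priority 0) */
	};

	static int __init example_init(void)
	{
		return register_die_notifier(&example_die_nb);
	}

	static void __exit example_exit(void)
	{
		unregister_die_notifier(&example_die_nb);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");
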
Signed-off-by: Keith Owens Acked-by: Dean Nelson Acked-by: Anil Keshavamurthy Signed-off-by: Tony Luck --- arch/ia64/kernel/kprobes.c | 22 +++++---- arch/ia64/kernel/mca.c | 120 +++++++++++++++++++++++++++++++++++---------- arch/ia64/kernel/process.c | 6 +++ arch/ia64/kernel/traps.c | 44 ++++++++--------- 4 files changed, 134 insertions(+), 58 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 96736a1..801eeae 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -347,7 +347,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) ((struct fnptr *)kretprobe_trampoline)->ip; spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + head = kretprobe_inst_table_head(current); /* * It is possible to have multiple instances associated with a given @@ -363,9 +363,9 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * kretprobe_trampoline */ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) + if (ri->task != current) /* another task is sharing our hash bucket */ - continue; + continue; if (ri->rp && ri->rp->handler) ri->rp->handler(ri, regs); @@ -394,7 +394,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * kprobe_handler() that we don't want the post_handler * to run (and have re-enabled preemption) */ - return 1; + return 1; } /* Called with kretprobe_lock held */ @@ -739,12 +739,16 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, switch(val) { case DIE_BREAK: - if (pre_kprobes_handler(args)) - ret = NOTIFY_STOP; + /* err is break number from ia64_bad_break() */ + if (args->err == 0x80200 || args->err == 0x80300) + if (pre_kprobes_handler(args)) + ret = NOTIFY_STOP; break; - case DIE_SS: - if (post_kprobes_handler(args->regs)) - ret = NOTIFY_STOP; + case DIE_FAULT: + /* err is vector number from ia64_fault() */ + if (args->err == 36) + if (post_kprobes_handler(args->regs)) + ret = NOTIFY_STOP; break; case DIE_PAGE_FAULT: /* kprobe_running() needs smp_processor_id() */ diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 52c47da..355af15 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -51,6 +51,9 @@ * * 2005-08-12 Keith Owens * Convert MCA/INIT handlers to use per event stacks and SAL/OS state. + * + * 2005-10-07 Keith Owens + * Add notify_die() hooks. 
*/ #include #include @@ -58,7 +61,6 @@ #include #include #include -#include #include #include #include @@ -69,6 +71,7 @@ #include #include +#include #include #include #include @@ -132,6 +135,14 @@ extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe); static int mca_init; + +static void inline +ia64_mca_spin(const char *func) +{ + printk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func); + while (1) + cpu_relax(); +} /* * IA64_MCA log support */ @@ -526,13 +537,16 @@ ia64_mca_wakeup_all(void) * Outputs : None */ static irqreturn_t -ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs) +ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *regs) { unsigned long flags; int cpu = smp_processor_id(); /* Mask all interrupts */ local_irq_save(flags); + if (notify_die(DIE_MCA_RENDZVOUS_ENTER, "MCA", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_DONE; /* Register with the SAL monarch that the slave has @@ -540,10 +554,18 @@ ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs) */ ia64_sal_mc_rendez(); + if (notify_die(DIE_MCA_RENDZVOUS_PROCESS, "MCA", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); + /* Wait for the monarch cpu to exit. */ while (monarch_cpu != -1) cpu_relax(); /* spin until monarch leaves */ + if (notify_die(DIE_MCA_RENDZVOUS_LEAVE, "MCA", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); + /* Enable all interrupts */ local_irq_restore(flags); return IRQ_HANDLED; @@ -933,6 +955,9 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA"); monarch_cpu = cpu; + if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); ia64_wait_for_slaves(cpu); /* Wakeup all the processors which are spinning in the rendezvous loop. @@ -942,6 +967,9 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, * spinning in SAL does not work. 
*/ ia64_mca_wakeup_all(); + if (notify_die(DIE_MCA_MONARCH_PROCESS, "MCA", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); /* Get the MCA error record and log it */ ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA); @@ -960,6 +988,9 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA); sos->os_status = IA64_MCA_CORRECTED; } + if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, 0, 0, recover) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); set_curr_task(cpu, previous_current); monarch_cpu = -1; @@ -1188,6 +1219,37 @@ ia64_mca_cpe_poll (unsigned long dummy) #endif /* CONFIG_ACPI */ +static int +default_monarch_init_process(struct notifier_block *self, unsigned long val, void *data) +{ + int c; + struct task_struct *g, *t; + if (val != DIE_INIT_MONARCH_PROCESS) + return NOTIFY_DONE; + printk(KERN_ERR "Processes interrupted by INIT -"); + for_each_online_cpu(c) { + struct ia64_sal_os_state *s; + t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET); + s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET); + g = s->prev_task; + if (g) { + if (g->pid) + printk(" %d", g->pid); + else + printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g); + } + } + printk("\n\n"); + if (read_trylock(&tasklist_lock)) { + do_each_thread (g, t) { + printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); + show_stack(t, NULL); + } while_each_thread (g, t); + read_unlock(&tasklist_lock); + } + return NOTIFY_DONE; +} + /* * C portion of the OS INIT handler * @@ -1212,8 +1274,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, static atomic_t slaves; static atomic_t monarchs; task_t *previous_current; - int cpu = smp_processor_id(), c; - struct task_struct *g, *t; + int cpu = smp_processor_id(); oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ console_loglevel = 15; /* make sure printks make it to console */ @@ -1253,8 +1314,17 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_INIT; while (monarch_cpu == -1) cpu_relax(); /* spin until monarch enters */ + if (notify_die(DIE_INIT_SLAVE_ENTER, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); + if (notify_die(DIE_INIT_SLAVE_PROCESS, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); while (monarch_cpu != -1) cpu_relax(); /* spin until monarch leaves */ + if (notify_die(DIE_INIT_SLAVE_LEAVE, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); printk("Slave on cpu %d returning to normal service.\n", cpu); set_curr_task(cpu, previous_current); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; @@ -1263,6 +1333,9 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, } monarch_cpu = cpu; + if (notify_die(DIE_INIT_MONARCH_ENTER, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); /* * Wait for a bit. 
On some machines (e.g., HP's zx2000 and zx6000, INIT can be @@ -1273,27 +1346,16 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, printk("Delaying for 5 seconds...\n"); udelay(5*1000000); ia64_wait_for_slaves(cpu); - printk(KERN_ERR "Processes interrupted by INIT -"); - for_each_online_cpu(c) { - struct ia64_sal_os_state *s; - t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET); - s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET); - g = s->prev_task; - if (g) { - if (g->pid) - printk(" %d", g->pid); - else - printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g); - } - } - printk("\n\n"); - if (read_trylock(&tasklist_lock)) { - do_each_thread (g, t) { - printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); - show_stack(t, NULL); - } while_each_thread (g, t); - read_unlock(&tasklist_lock); - } + /* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through + * to default_monarch_init_process() above and just print all the + * tasks. + */ + if (notify_die(DIE_INIT_MONARCH_PROCESS, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); + if (notify_die(DIE_INIT_MONARCH_LEAVE, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); printk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu); atomic_dec(&monarchs); set_curr_task(cpu, previous_current); @@ -1462,6 +1524,10 @@ ia64_mca_init(void) s64 rc; struct ia64_sal_retval isrv; u64 timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */ + static struct notifier_block default_init_monarch_nb = { + .notifier_call = default_monarch_init_process, + .priority = 0/* we need to notified last */ + }; IA64_MCA_DEBUG("%s: begin\n", __FUNCTION__); @@ -1555,6 +1621,10 @@ ia64_mca_init(void) "(status %ld)\n", rc); return; } + if (register_die_notifier(&default_init_monarch_nb)) { + printk(KERN_ERR "Failed to register default monarch INIT process\n"); + return; + } IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __FUNCTION__); diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 051e050..c78355c 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -4,6 +4,9 @@ * Copyright (C) 1998-2003 Hewlett-Packard Co * David Mosberger-Tang * 04/11/17 Ashok Raj Added CPU Hotplug Support + * + * 2005-10-07 Keith Owens + * Add notify_die() hooks. 
*/ #define __KERNEL_SYSCALLS__ /* see */ #include @@ -34,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -804,12 +808,14 @@ cpu_halt (void) void machine_restart (char *restart_cmd) { + (void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0); (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); } void machine_halt (void) { + (void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0); cpu_halt(); } diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index f970359..fba5fdd 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -30,17 +30,20 @@ fpswa_interface_t *fpswa_interface; EXPORT_SYMBOL(fpswa_interface); struct notifier_block *ia64die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); -int register_die_notifier(struct notifier_block *nb) +int +register_die_notifier(struct notifier_block *nb) { - int err = 0; - unsigned long flags; - spin_lock_irqsave(&die_notifier_lock, flags); - err = notifier_chain_register(&ia64die_chain, nb); - spin_unlock_irqrestore(&die_notifier_lock, flags); - return err; + return notifier_chain_register(&ia64die_chain, nb); } +EXPORT_SYMBOL_GPL(register_die_notifier); + +int +unregister_die_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&ia64die_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_die_notifier); void __init trap_init (void) @@ -105,6 +108,7 @@ die (const char *str, struct pt_regs *regs, long err) if (++die.lock_owner_depth < 3) { printk("%s[%d]: %s %ld [%d]\n", current->comm, current->pid, str, err, ++die_counter); + (void) notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); show_regs(regs); } else printk(KERN_ERR "Recursive die() failure, output suppressed\n"); @@ -155,9 +159,8 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs) switch (break_num) { case 0: /* unknown error (used by GCC for __builtin_abort()) */ if (notify_die(DIE_BREAK, "break 0", regs, break_num, TRAP_BRKPT, SIGTRAP) - == NOTIFY_STOP) { + == NOTIFY_STOP) return; - } die_if_kernel("bugcheck!", regs, break_num); sig = SIGILL; code = ILL_ILLOPC; break; @@ -210,15 +213,6 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs) sig = SIGILL; code = __ILL_BNDMOD; break; - case 0x80200: - case 0x80300: - if (notify_die(DIE_BREAK, "kprobe", regs, break_num, TRAP_BRKPT, SIGTRAP) - == NOTIFY_STOP) { - return; - } - sig = SIGTRAP; code = TRAP_BRKPT; - break; - default: if (break_num < 0x40000 || break_num > 0x100000) die_if_kernel("Bad break", regs, break_num); @@ -226,6 +220,9 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs) if (break_num < 0x80000) { sig = SIGILL; code = __ILL_BREAK; } else { + if (notify_die(DIE_BREAK, "bad break", regs, break_num, TRAP_BRKPT, SIGTRAP) + == NOTIFY_STOP) + return; sig = SIGTRAP; code = TRAP_BRKPT; } } @@ -578,12 +575,11 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, #endif break; case 35: siginfo.si_code = TRAP_BRANCH; ifa = 0; break; - case 36: - if (notify_die(DIE_SS, "ss", ®s, vector, - vector, SIGTRAP) == NOTIFY_STOP) - return; - siginfo.si_code = TRAP_TRACE; ifa = 0; break; + case 36: siginfo.si_code = TRAP_TRACE; ifa = 0; break; } + if (notify_die(DIE_FAULT, "ia64_fault", ®s, vector, siginfo.si_code, SIGTRAP) + == NOTIFY_STOP) + return; siginfo.si_signo = SIGTRAP; siginfo.si_errno = 0; siginfo.si_addr = (void __user *) ifa; -- cgit v1.1 From cf20d1eafb648bf395b153cbcd0cde40f88c220a Mon Sep 17 00:00:00 2001 From: David Mosberger-Tang Date: Wed, 2 Nov 2005 
22:40:19 -0800 Subject: [IA64] align signal-frame even when not using alternate signal-stack At the moment, attempting to invoke a signal-handler on the normal stack is guaranteed to fail if the stack-pointer happens not to be 16-byte aligned. This is because the signal-trampoline will attempt to store fp-regs with stf.spill instructions, which will trap for misaligned addresses. This isn't terribly useful behavior. It's better to just always align the signal frame to the next lower 16-byte boundary. Signed-off-by: David Mosberger-Tang Signed-off-by: Tony Luck --- arch/ia64/kernel/signal.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 774f34b..58ce07e 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -387,15 +387,14 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct sigscratch *scr) { extern char __kernel_sigtramp[]; - unsigned long tramp_addr, new_rbs = 0; + unsigned long tramp_addr, new_rbs = 0, new_sp; struct sigframe __user *frame; long err; - frame = (void __user *) scr->pt.r12; + new_sp = scr->pt.r12; tramp_addr = (unsigned long) __kernel_sigtramp; - if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags((unsigned long) frame) == 0) { - frame = (void __user *) ((current->sas_ss_sp + current->sas_ss_size) - & ~(STACK_ALIGN - 1)); + if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags(new_sp) == 0) { + new_sp = current->sas_ss_sp + current->sas_ss_size; /* * We need to check for the register stack being on the signal stack * separately, because it's switched separately (memory stack is switched @@ -404,7 +403,7 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, if (!rbs_on_sig_stack(scr->pt.ar_bspstore)) new_rbs = (current->sas_ss_sp + sizeof(long) - 1) & ~(sizeof(long) - 1); } - frame = (void __user *) frame - ((sizeof(*frame) + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1)); + frame = (void __user *) ((new_sp - sizeof(*frame)) & -STACK_ALIGN); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return force_sigsegv_info(sig, frame); -- cgit v1.1 From a14f25a076a8e5040d6f4e93f84034c81bcddbf7 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 4 Nov 2005 13:39:38 -0600 Subject: [IA64] MCA recovery based on PSP bits The determination of whether an MCA is recoverable or not must be based on the bits set in the PSP (Processor State Parameter). The specific bits are shown in the Intel IA-64 Architecture Software Developer's Manual, Vol 2, Table 11-6 Software Recovery Bits in Processor State Parameter. Those bits should be consistent across the entire IA-64 family of processors. Signed-off-by: Russ Anderson Signed-off-by: Tony Luck --- arch/ia64/kernel/mca_drv.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index f081c60..eb860e2 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c @@ -546,9 +546,20 @@ recover_from_processor_error(int platform, slidx_table_t *slidx, (pal_processor_state_info_t*)peidx_psp(peidx); /* - * We cannot recover errors with other than bus_check. + * Processor recovery status must key off of the PAL recovery + * status in the Processor State Parameter. */ - if (psp->cc || psp->rc || psp->uc) + + /* + * The machine check is corrected. + */ + if (psp->cm == 1) + return 1; + + /* + * The error was not contained. 
Software must be reset. + */ + if (psp->us || psp->ci == 0) return 0; /* @@ -569,8 +580,6 @@ recover_from_processor_error(int platform, slidx_table_t *slidx, return 0; if (pbci->eb && pbci->bsi > 0) return 0; - if (psp->ci == 0) - return 0; /* * This is a local MCA and estimated as recoverble external bus error. -- cgit v1.1 From 56f87b82171245a81a5dbac5e703d3941d80da49 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 4 Nov 2005 13:57:00 -0600 Subject: [IA64] MCA recovery: pfn_valid() needs a pfn paddr needs to be shifted by PAGE_SHIFT to be valid input for pfn_valid(). Signed-off-by: Russ Anderson Signed-off-by: Tony Luck --- arch/ia64/kernel/mca_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index f081c60..6ff32d4 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c @@ -88,7 +88,7 @@ mca_page_isolate(unsigned long paddr) if (!ia64_phys_addr_valid(paddr)) return ISOLATE_NONE; - if (!pfn_valid(paddr)) + if (!pfn_valid(paddr >> PAGE_SHIFT)) return ISOLATE_NONE; /* convert physical address to physical page number */ -- cgit v1.1 From cbb921443424fb8019e52bae83e442d01f7715ef Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 4 Nov 2005 16:58:28 -0600 Subject: [IA64] MCA recovery: Bump reference count on bad pages When a page has a memory uncorrectable ECC error, the recovery code wants to prevent the page from being reused. This change bumps the reference count to prevent the page from getting back on the free list. Signed-off-by: Russ Anderson (rja@sgi.com) Signed-off-by: Tony Luck --- arch/ia64/kernel/mca_drv.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index 6ff32d4..eb39bc9 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c @@ -108,6 +108,7 @@ mca_page_isolate(unsigned long paddr) return ISOLATE_NG; /* add attribute 'Reserved' and register the page */ + get_page(p); SetPageReserved(p); page_isolate[num_page_isolate++] = p; -- cgit v1.1 From 5bfb5d690f36d316a5f3b4f7775fda996faa6b12 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 8 Nov 2005 21:39:01 -0800 Subject: [PATCH] sched: disable preempt in idle tasks Run idle threads with preempt disabled. Also corrected a bugs in arm26's cpu_idle (make it actually call schedule()). How did it ever work before? Might fix the CPU hotplugging hang which Nigel Cunningham noted. We think the bug hits if the idle thread is preempted after checking need_resched() and before going to sleep, then the CPU offlined. After calling stop_machine_run, the CPU eventually returns from preemption and into the idle thread and goes to sleep. The CPU will continue executing previous idle and have no chance to call play_dead. By disabling preemption until we are ready to explicitly schedule, this bug is fixed and the idle threads generally become more robust. 
From: alexs PPC build fix From: Yoichi Yuasa MIPS build fix Signed-off-by: Nick Piggin Signed-off-by: Yoichi Yuasa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/process.c | 2 ++ arch/ia64/kernel/smpboot.c | 1 + 2 files changed, 3 insertions(+) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 051e050..4c621fc 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -292,7 +292,9 @@ cpu_idle (void) #ifdef CONFIG_SMP normal_xtp(); #endif + preempt_enable_no_resched(); schedule(); + preempt_disable(); check_pgt_cache(); if (cpu_is_offline(smp_processor_id())) play_dead(); diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 400a489..8f44e7d 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -399,6 +399,7 @@ start_secondary (void *unused) Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id()); efi_map_pal_code(); cpu_init(); + preempt_disable(); smp_callin(); cpu_idle(); -- cgit v1.1 From 64c7c8f88559624abdbe12b5da6502e8879f8d28 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 8 Nov 2005 21:39:04 -0800 Subject: [PATCH] sched: resched and cpu_idle rework Make some changes to the NEED_RESCHED and POLLING_NRFLAG to reduce confusion, and make their semantics rigid. Improves efficiency of resched_task and some cpu_idle routines. * In resched_task: - TIF_NEED_RESCHED is only cleared with the task's runqueue lock held, and as we hold it during resched_task, then there is no need for an atomic test and set there. The only other time this should be set is when the task's quantum expires, in the timer interrupt - this is protected against because the rq lock is irq-safe. - If TIF_NEED_RESCHED is set, then we don't need to do anything. It won't get unset until the task get's schedule()d off. - If we are running on the same CPU as the task we resched, then set TIF_NEED_RESCHED and no further action is required. - If we are running on another CPU, and TIF_POLLING_NRFLAG is *not* set after TIF_NEED_RESCHED has been set, then we need to send an IPI. Using these rules, we are able to remove the test and set operation in resched_task, and make clear the previously vague semantics of POLLING_NRFLAG. * In idle routines: - Enter cpu_idle with preempt disabled. When the need_resched() condition becomes true, explicitly call schedule(). This makes things a bit clearer (IMO), but haven't updated all architectures yet. - Many do a test and clear of TIF_NEED_RESCHED for some reason. According to the resched_task rules, this isn't needed (and actually breaks the assumption that TIF_NEED_RESCHED is only cleared with the runqueue lock held). So remove that. Generally one less locked memory op when switching to the idle thread. - Many idle routines clear TIF_POLLING_NRFLAG, and only set it in the inner most polling idle loops. The above resched_task semantics allow it to be set until before the last time need_resched() is checked before going into a halt requiring interrupt wakeup. Many idle routines simply never enter such a halt, and so POLLING_NRFLAG can be always left set, completely eliminating resched IPIs when rescheduling the idle task. POLLING_NRFLAG width can be increased, to reduce the chance of resched IPIs. 
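[Editorial note] Restated as code, the resched_task() rules described above come down to roughly the following. This is only a sketch of the logic for reference, not the scheduler code itself; it assumes the usual <linux/sched.h> context and uses the generic thread-flag accessors.

	/* Sketch: caller holds this task's (irq-safe) runqueue lock, so
	 * TIF_NEED_RESCHED cannot be cleared underneath us. */
	static void resched_task_sketch(struct task_struct *p)
	{
		int cpu;

		if (test_tsk_thread_flag(p, TIF_NEED_RESCHED))
			return;			/* already pending, nothing more to do */

		set_tsk_thread_flag(p, TIF_NEED_RESCHED);

		cpu = task_cpu(p);
		if (cpu == smp_processor_id())
			return;			/* our own CPU will notice on its way out */

		/* Remote CPU: only send the IPI if it is not polling NEED_RESCHED. */
		smp_mb();
		if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG))
			smp_send_reschedule(cpu);
	}
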
Signed-off-by: Nick Piggin Cc: Ingo Molnar Cc: Con Kolivas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/process.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 4c621fc..640d690 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -197,11 +197,15 @@ void default_idle (void) { local_irq_enable(); - while (!need_resched()) - if (can_do_pal_halt) - safe_halt(); - else + while (!need_resched()) { + if (can_do_pal_halt) { + local_irq_disable(); + if (!need_resched()) + safe_halt(); + local_irq_enable(); + } else cpu_relax(); + } } #ifdef CONFIG_HOTPLUG_CPU @@ -263,16 +267,16 @@ void __attribute__((noreturn)) cpu_idle (void) { void (*mark_idle)(int) = ia64_mark_idle; + int cpu = smp_processor_id(); + set_thread_flag(TIF_POLLING_NRFLAG); /* endless idle loop with no priority at all */ while (1) { + if (!need_resched()) { + void (*idle)(void); #ifdef CONFIG_SMP - if (!need_resched()) min_xtp(); #endif - while (!need_resched()) { - void (*idle)(void); - if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; @@ -284,19 +288,17 @@ cpu_idle (void) if (!idle) idle = default_idle; (*idle)(); - } - - if (mark_idle) - (*mark_idle)(0); - + if (mark_idle) + (*mark_idle)(0); #ifdef CONFIG_SMP - normal_xtp(); + normal_xtp(); #endif + } preempt_enable_no_resched(); schedule(); preempt_disable(); check_pgt_cache(); - if (cpu_is_offline(smp_processor_id())) + if (cpu_is_offline(cpu)) play_dead(); } } -- cgit v1.1 From baf47fb66020e5c3fe2386680fa2d79d1f8e0052 Mon Sep 17 00:00:00 2001 From: Panagiotis Issaris Date: Wed, 9 Nov 2005 02:08:42 +0100 Subject: [IA64] Replace kcalloc(1, with kzalloc. Conversion from kcalloc(1, to kzalloc. Signed-off-by: Panagiotis Issaris Signed-off-by: Tony Luck --- arch/ia64/kernel/efi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index f72ea6a..a3aa45c 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -987,7 +987,7 @@ efi_initialize_iomem_resources(struct resource *code_resource, break; } - if ((res = kcalloc(1, sizeof(struct resource), GFP_KERNEL)) == NULL) { + if ((res = kzalloc(sizeof(struct resource), GFP_KERNEL)) == NULL) { printk(KERN_ERR "failed to alocate resource for iomem\n"); return; } -- cgit v1.1 From 837cd0bdf54dd954cd6aa43d250f75ab5db79617 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Fri, 11 Nov 2005 09:35:43 -0600 Subject: [IA64] 4-level page tables This patch introduces 4-level page tables to ia64. I have run some benchmarks and found nothing interesting. Performance has consistently fallen within the noise range. It also introduces a config option (setting the default to 3 levels). The config option prevents having 4 level page tables with 64k base page size. 
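[Editorial note] For readers not fluent in ivt.S, the lookup that the assembly below hand-codes corresponds to the generic four-level walk. The following is only a C-level reference sketch using the generic page-table accessors; it is not code from the patch, and pud_offset() collapses to the pgd level when CONFIG_PGTABLE_4 is not set.

	/* Reference sketch of the pgd -> pud -> pmd -> pte walk mirrored by
	 * the vhpt_miss/nested_dtlb_miss assembly (illustration only). */
	static pte_t *four_level_lookup(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgd = pgd_offset(mm, addr);
		pud_t *pud;
		pmd_t *pmd;

		if (pgd_none(*pgd))
			return NULL;
		pud = pud_offset(pgd, addr);	/* folded away when !CONFIG_PGTABLE_4 */
		if (pud_none(*pud))
			return NULL;
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd))
			return NULL;
		return pte_offset_kernel(pmd, addr);
	}
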
Signed-off-by: Robin Holt Signed-off-by: Tony Luck --- arch/ia64/kernel/ivt.S | 63 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 15 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S index c13ca0d..e06f21f 100644 --- a/arch/ia64/kernel/ivt.S +++ b/arch/ia64/kernel/ivt.S @@ -114,7 +114,7 @@ ENTRY(vhpt_miss) shl r21=r16,3 // shift bit 60 into sign bit shr.u r17=r16,61 // get the region number into r17 ;; - shr r22=r21,3 + shr.u r22=r21,3 #ifdef CONFIG_HUGETLB_PAGE extr.u r26=r25,2,6 ;; @@ -140,20 +140,34 @@ ENTRY(vhpt_miss) (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) cmp.eq p7,p6=0,r21 // unused address bits all zeroes? - shr.u r18=r22,PMD_SHIFT // shift L2 index into position +#ifdef CONFIG_PGTABLE_4 + shr.u r28=r22,PUD_SHIFT // shift L2 index into position +#else + shr.u r18=r22,PMD_SHIFT // shift L3 index into position +#endif ;; ld8 r17=[r17] // fetch the L1 entry (may be 0) ;; (p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? - dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry +#ifdef CONFIG_PGTABLE_4 + dep r28=r28,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry + ;; + shr.u r18=r22,PMD_SHIFT // shift L3 index into position +(p7) ld8 r29=[r28] // fetch the L2 entry (may be 0) ;; -(p7) ld8 r20=[r17] // fetch the L2 entry (may be 0) - shr.u r19=r22,PAGE_SHIFT // shift L3 index into position +(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was L2 entry NULL? + dep r17=r18,r29,3,(PAGE_SHIFT-3) // compute address of L3 page table entry +#else + dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry +#endif ;; -(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was L2 entry NULL? - dep r21=r19,r20,3,(PAGE_SHIFT-3) // compute address of L3 page table entry +(p7) ld8 r20=[r17] // fetch the L3 entry (may be 0) + shr.u r19=r22,PAGE_SHIFT // shift L4 index into position ;; -(p7) ld8 r18=[r21] // read the L3 PTE +(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was L3 entry NULL? + dep r21=r19,r20,3,(PAGE_SHIFT-3) // compute address of L4 page table entry + ;; +(p7) ld8 r18=[r21] // read the L4 PTE mov r19=cr.isr // cr.isr bit 0 tells us if this is an insn miss ;; (p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared? @@ -192,14 +206,21 @@ ENTRY(vhpt_miss) * between reading the pagetable and the "itc". If so, flush the entry we * inserted and retry. */ - ld8 r25=[r21] // read L3 PTE again - ld8 r26=[r17] // read L2 entry again + ld8 r25=[r21] // read L4 entry again + ld8 r26=[r17] // read L3 PTE again +#ifdef CONFIG_PGTABLE_4 + ld8 r18=[r28] // read L2 entry again +#endif + cmp.ne p6,p7=r0,r0 ;; - cmp.ne p6,p7=r26,r20 // did L2 entry change + cmp.ne.or.andcm p6,p7=r26,r20 // did L3 entry change +#ifdef CONFIG_PGTABLE_4 + cmp.ne.or.andcm p6,p7=r29,r18 // did L4 PTE change +#endif mov r27=PAGE_SHIFT<<2 ;; (p6) ptc.l r22,r27 // purge PTE page translation -(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did L3 PTE change +(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did L4 PTE change ;; (p6) ptc.l r16,r27 // purge translation #endif @@ -432,18 +453,30 @@ ENTRY(nested_dtlb_miss) (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) cmp.eq p7,p6=0,r21 // unused address bits all zeroes? 
- shr.u r18=r22,PMD_SHIFT // shift L2 index into position +#ifdef CONFIG_PGTABLE_4 + shr.u r18=r22,PUD_SHIFT // shift L2 index into position +#else + shr.u r18=r22,PMD_SHIFT // shift L3 index into position +#endif ;; ld8 r17=[r17] // fetch the L1 entry (may be 0) ;; (p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry ;; +#ifdef CONFIG_PGTABLE_4 (p7) ld8 r17=[r17] // fetch the L2 entry (may be 0) - shr.u r19=r22,PAGE_SHIFT // shift L3 index into position + shr.u r18=r22,PMD_SHIFT // shift L3 index into position ;; (p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL? - dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry + dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry + ;; +#endif +(p7) ld8 r17=[r17] // fetch the L3 entry (may be 0) + shr.u r19=r22,PAGE_SHIFT // shift L4 index into position + ;; +(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L3 entry NULL? + dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L4 page table entry (p6) br.cond.spnt page_fault mov b0=r30 br.sptk.many b0 // return to continuation point -- cgit v1.1 From 1e185b97b4364063f1135604b87f8d8469944233 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Tue, 15 Nov 2005 14:37:05 -0800 Subject: [PATCH] ia64: cpu_idle performance bug fix Our performance validation on 2.6.15-rc1 caught a disastrous performance regression on ia64 with netperf (-98%) and volanomark (-58%) compares to previous kernel version 2.6.14-git7. See the following chart (result group 1 & 2). http://kernel-perf.sourceforge.net/results.machine_id=26.html We have root caused it to commit 64c7c8f88559624abdbe12b5da6502e8879f8d28 This changeset broke the ia64 task resched notification. In sched.c:resched_task(), a reschedule IPI is conditioned upon TIF_POLLING_NRFLAG. However, the above changeset unconditionally set the polling thread flag for idle tasks regardless whether pal_halt_light is in use or not. As a result, resched IPI is not sent from resched_task(). And since the default behavior on ia64 is to use pal_halt_light, we end up delaying the rescheduling task until next timer tick, and thus cause the performance regression. This fixes the performance bug. I'm glad our performance suite is turning up bad performance bug like this in time. 
Signed-off-by: Ken Chen Signed-off-by: Linus Torvalds --- arch/ia64/kernel/process.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index e92ea64..4305d2b 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -202,12 +202,9 @@ default_idle (void) { local_irq_enable(); while (!need_resched()) { - if (can_do_pal_halt) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); - local_irq_enable(); - } else + if (can_do_pal_halt) + safe_halt(); + else cpu_relax(); } } @@ -272,10 +269,14 @@ cpu_idle (void) { void (*mark_idle)(int) = ia64_mark_idle; int cpu = smp_processor_id(); - set_thread_flag(TIF_POLLING_NRFLAG); /* endless idle loop with no priority at all */ while (1) { + if (can_do_pal_halt) + clear_thread_flag(TIF_POLLING_NRFLAG); + else + set_thread_flag(TIF_POLLING_NRFLAG); + if (!need_resched()) { void (*idle)(void); #ifdef CONFIG_SMP -- cgit v1.1 From fedb25fae72bc2c3709448a43be067439643da87 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Thu, 17 Nov 2005 01:38:42 -0800 Subject: [IA64] 4 level page table bug fix in vhpt_miss From source code inspection, I think there is a bug with 4 level page table with vhpt_miss handler. In the code path of rechecking page table entry against previously read value after tlb insertion, *pte value in register r18 was overwritten with value newly read from pud pointer, render the check of new *pte against previous *pte completely wrong. Though the bug is none fatal and the penalty is to purge the entry and retry. For functional correctness, it should be fixed. The fix is to use a different register so new *pud don't trash *pte. (btw, the comments in the cmp statement is wrong as well, which I will address in the next patch). Signed-off-by: Ken Chen Signed-off-by: Tony Luck --- arch/ia64/kernel/ivt.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S index e06f21f..c71c792 100644 --- a/arch/ia64/kernel/ivt.S +++ b/arch/ia64/kernel/ivt.S @@ -209,13 +209,13 @@ ENTRY(vhpt_miss) ld8 r25=[r21] // read L4 entry again ld8 r26=[r17] // read L3 PTE again #ifdef CONFIG_PGTABLE_4 - ld8 r18=[r28] // read L2 entry again + ld8 r19=[r28] // read L2 entry again #endif cmp.ne p6,p7=r0,r0 ;; cmp.ne.or.andcm p6,p7=r26,r20 // did L3 entry change #ifdef CONFIG_PGTABLE_4 - cmp.ne.or.andcm p6,p7=r29,r18 // did L4 PTE change + cmp.ne.or.andcm p6,p7=r19,r29 // did L4 PTE change #endif mov r27=PAGE_SHIFT<<2 ;; -- cgit v1.1 From e8aabc47168d24eabc08418db4e034a4c625721c Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Thu, 17 Nov 2005 01:55:34 -0800 Subject: [IA64] polish comments for tlb fault handler in ivt.S Polish the comments specifically in vhpt_miss and nested_dtlb_miss handlers. I think it's better to explicitly name each page table level with its name instead of numerically name them. i.e., use pgd, pud, pmd, and pte instead of referring as L1, L2, L3 etc. Along the line, remove some magic number in the comments like: "PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)". No code change at all, pure comment update. Feel free to shoot anything you have, darts or tomahawk cruise missile. 
I will duck behind a bunker ;-) Signed-off-by: Ken Chen Acked-by: Robin Holt Signed-off-by: Tony Luck --- arch/ia64/kernel/ivt.S | 133 ++++++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 62 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S index c71c792..301f2e9 100644 --- a/arch/ia64/kernel/ivt.S +++ b/arch/ia64/kernel/ivt.S @@ -91,16 +91,17 @@ ENTRY(vhpt_miss) * (the "original") TLB miss, which may either be caused by an instruction * fetch or a data access (or non-access). * - * What we do here is normal TLB miss handing for the _original_ miss, followed - * by inserting the TLB entry for the virtual page table page that the VHPT - * walker was attempting to access. The latter gets inserted as long - * as both L1 and L2 have valid mappings for the faulting address. - * The TLB entry for the original miss gets inserted only if - * the L3 entry indicates that the page is present. + * What we do here is normal TLB miss handing for the _original_ miss, + * followed by inserting the TLB entry for the virtual page table page + * that the VHPT walker was attempting to access. The latter gets + * inserted as long as page table entry above pte level have valid + * mappings for the faulting address. The TLB entry for the original + * miss gets inserted only if the pte entry indicates that the page is + * present. * * do_page_fault gets invoked in the following cases: * - the faulting virtual address uses unimplemented address bits - * - the faulting virtual address has no L1, L2, or L3 mapping + * - the faulting virtual address has no valid page table mapping */ mov r16=cr.ifa // get address that caused the TLB miss #ifdef CONFIG_HUGETLB_PAGE @@ -126,7 +127,7 @@ ENTRY(vhpt_miss) #endif ;; cmp.eq p6,p7=5,r17 // is IFA pointing into to region 5? - shr.u r18=r22,PGDIR_SHIFT // get bits 33-63 of the faulting address + shr.u r18=r22,PGDIR_SHIFT // get bottom portion of pgd index bit ;; (p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place @@ -137,38 +138,38 @@ ENTRY(vhpt_miss) (p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT (p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 ;; -(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 -(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) +(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5 +(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4] cmp.eq p7,p6=0,r21 // unused address bits all zeroes? #ifdef CONFIG_PGTABLE_4 - shr.u r28=r22,PUD_SHIFT // shift L2 index into position + shr.u r28=r22,PUD_SHIFT // shift pud index into position #else - shr.u r18=r22,PMD_SHIFT // shift L3 index into position + shr.u r18=r22,PMD_SHIFT // shift pmd index into position #endif ;; - ld8 r17=[r17] // fetch the L1 entry (may be 0) + ld8 r17=[r17] // get *pgd (may be 0) ;; -(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? +(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL? #ifdef CONFIG_PGTABLE_4 - dep r28=r28,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry + dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr) ;; - shr.u r18=r22,PMD_SHIFT // shift L3 index into position -(p7) ld8 r29=[r28] // fetch the L2 entry (may be 0) + shr.u r18=r22,PMD_SHIFT // shift pmd index into position +(p7) ld8 r29=[r28] // get *pud (may be 0) ;; -(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was L2 entry NULL? 
- dep r17=r18,r29,3,(PAGE_SHIFT-3) // compute address of L3 page table entry +(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was pud_present(*pud) == NULL? + dep r17=r18,r29,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr) #else - dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pgd,addr) #endif ;; -(p7) ld8 r20=[r17] // fetch the L3 entry (may be 0) - shr.u r19=r22,PAGE_SHIFT // shift L4 index into position +(p7) ld8 r20=[r17] // get *pmd (may be 0) + shr.u r19=r22,PAGE_SHIFT // shift pte index into position ;; -(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was L3 entry NULL? - dep r21=r19,r20,3,(PAGE_SHIFT-3) // compute address of L4 page table entry +(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was pmd_present(*pmd) == NULL? + dep r21=r19,r20,3,(PAGE_SHIFT-3) // r21=pte_offset(pmd,addr) ;; -(p7) ld8 r18=[r21] // read the L4 PTE - mov r19=cr.isr // cr.isr bit 0 tells us if this is an insn miss +(p7) ld8 r18=[r21] // read *pte + mov r19=cr.isr // cr.isr bit 32 tells us if this is an insn miss ;; (p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared? mov r22=cr.iha // get the VHPT address that caused the TLB miss @@ -202,25 +203,33 @@ ENTRY(vhpt_miss) dv_serialize_data /* - * Re-check L2 and L3 pagetable. If they changed, we may have received a ptc.g + * Re-check pagetable entry. If they changed, we may have received a ptc.g * between reading the pagetable and the "itc". If so, flush the entry we - * inserted and retry. + * inserted and retry. At this point, we have: + * + * r28 = equivalent of pud_offset(pgd, ifa) + * r17 = equivalent of pmd_offset(pud, ifa) + * r21 = equivalent of pte_offset(pmd, ifa) + * + * r29 = *pud + * r20 = *pmd + * r18 = *pte */ - ld8 r25=[r21] // read L4 entry again - ld8 r26=[r17] // read L3 PTE again + ld8 r25=[r21] // read *pte again + ld8 r26=[r17] // read *pmd again #ifdef CONFIG_PGTABLE_4 - ld8 r19=[r28] // read L2 entry again + ld8 r19=[r28] // read *pud again #endif cmp.ne p6,p7=r0,r0 ;; - cmp.ne.or.andcm p6,p7=r26,r20 // did L3 entry change + cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change #ifdef CONFIG_PGTABLE_4 - cmp.ne.or.andcm p6,p7=r19,r29 // did L4 PTE change + cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change #endif mov r27=PAGE_SHIFT<<2 ;; (p6) ptc.l r22,r27 // purge PTE page translation -(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did L4 PTE change +(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did *pte change ;; (p6) ptc.l r16,r27 // purge translation #endif @@ -235,19 +244,19 @@ END(vhpt_miss) ENTRY(itlb_miss) DBG_FAULT(1) /* - * The ITLB handler accesses the L3 PTE via the virtually mapped linear + * The ITLB handler accesses the PTE via the virtually mapped linear * page table. If a nested TLB miss occurs, we switch into physical - * mode, walk the page table, and then re-execute the L3 PTE read - * and go on normally after that. + * mode, walk the page table, and then re-execute the PTE read and + * go on normally after that. */ mov r16=cr.ifa // get virtual address mov r29=b0 // save b0 mov r31=pr // save predicates .itlb_fault: - mov r17=cr.iha // get virtual address of L3 PTE + mov r17=cr.iha // get virtual address of PTE movl r30=1f // load nested fault continuation point ;; -1: ld8 r18=[r17] // read L3 PTE +1: ld8 r18=[r17] // read *pte ;; mov b0=r29 tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? 
@@ -262,7 +271,7 @@ ENTRY(itlb_miss) */ dv_serialize_data - ld8 r19=[r17] // read L3 PTE again and see if same + ld8 r19=[r17] // read *pte again and see if same mov r20=PAGE_SHIFT<<2 // setup page size for purge ;; cmp.ne p7,p0=r18,r19 @@ -279,19 +288,19 @@ END(itlb_miss) ENTRY(dtlb_miss) DBG_FAULT(2) /* - * The DTLB handler accesses the L3 PTE via the virtually mapped linear + * The DTLB handler accesses the PTE via the virtually mapped linear * page table. If a nested TLB miss occurs, we switch into physical - * mode, walk the page table, and then re-execute the L3 PTE read - * and go on normally after that. + * mode, walk the page table, and then re-execute the PTE read and + * go on normally after that. */ mov r16=cr.ifa // get virtual address mov r29=b0 // save b0 mov r31=pr // save predicates dtlb_fault: - mov r17=cr.iha // get virtual address of L3 PTE + mov r17=cr.iha // get virtual address of PTE movl r30=1f // load nested fault continuation point ;; -1: ld8 r18=[r17] // read L3 PTE +1: ld8 r18=[r17] // read *pte ;; mov b0=r29 tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? @@ -306,7 +315,7 @@ dtlb_fault: */ dv_serialize_data - ld8 r19=[r17] // read L3 PTE again and see if same + ld8 r19=[r17] // read *pte again and see if same mov r20=PAGE_SHIFT<<2 // setup page size for purge ;; cmp.ne p7,p0=r18,r19 @@ -420,7 +429,7 @@ ENTRY(nested_dtlb_miss) * r30: continuation address * r31: saved pr * - * Output: r17: physical address of L3 PTE of faulting address + * Output: r17: physical address of PTE of faulting address * r29: saved b0 * r30: continuation address * r31: saved pr @@ -450,33 +459,33 @@ ENTRY(nested_dtlb_miss) (p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT (p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 ;; -(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 -(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) +(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5 +(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4] cmp.eq p7,p6=0,r21 // unused address bits all zeroes? #ifdef CONFIG_PGTABLE_4 - shr.u r18=r22,PUD_SHIFT // shift L2 index into position + shr.u r18=r22,PUD_SHIFT // shift pud index into position #else - shr.u r18=r22,PMD_SHIFT // shift L3 index into position + shr.u r18=r22,PMD_SHIFT // shift pmd index into position #endif ;; - ld8 r17=[r17] // fetch the L1 entry (may be 0) + ld8 r17=[r17] // get *pgd (may be 0) ;; -(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? - dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry +(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL? + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr) ;; #ifdef CONFIG_PGTABLE_4 -(p7) ld8 r17=[r17] // fetch the L2 entry (may be 0) - shr.u r18=r22,PMD_SHIFT // shift L3 index into position +(p7) ld8 r17=[r17] // get *pud (may be 0) + shr.u r18=r22,PMD_SHIFT // shift pmd index into position ;; -(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL? - dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry +(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pud_present(*pud) == NULL? + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr) ;; #endif -(p7) ld8 r17=[r17] // fetch the L3 entry (may be 0) - shr.u r19=r22,PAGE_SHIFT // shift L4 index into position +(p7) ld8 r17=[r17] // get *pmd (may be 0) + shr.u r19=r22,PAGE_SHIFT // shift pte index into position ;; -(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L3 entry NULL? 
- dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L4 page table entry +(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pmd_present(*pmd) == NULL? + dep r17=r19,r17,3,(PAGE_SHIFT-3) // r17=pte_offset(pmd,addr); (p6) br.cond.spnt page_fault mov b0=r30 br.sptk.many b0 // return to continuation point -- cgit v1.1 From 8bf1101bd52573e0573e374d56d2feecdbb5e444 Mon Sep 17 00:00:00 2001 From: Jim Keniston Date: Wed, 23 Nov 2005 13:37:42 -0800 Subject: [PATCH] kprobes: Fix return probes on sys_execve Fix a bug in kprobes that can cause an Oops or even a crash when a return probe is installed on one of the following functions: sys_execve, do_execve, load_*_binary, flush_old_exec, or flush_thread. The fix is to remove the call to kprobe_flush_task() in flush_thread(). This fix has been tested on all architectures for which the return-probes feature has been implemented (i386, x86_64, ppc64, ia64). Please apply. BACKGROUND Up to now, we have called kprobe_flush_task() under two situations: when a task exits, and when it execs. Flushing kretprobe_instances on exit is correct because (a) do_exit() doesn't return, and (b) one or more return-probed functions may be active when a task calls do_exit(). Neither is the case for sys_execve() and its callees. Initially, the mistaken call to kprobe_flush_task() on exec was harmless because we put the "real" return address of each active probed function back in the stack, just to be safe, when we recycled its kretprobe_instance. When support for ppc64 and ia64 was added, this safety measure couldn't be employed, and was eventually dropped even for i386 and x86_64. sys_execve() and its callees were informally blacklisted for return probes until this fix was developed. Acked-by: Prasanna S Panchamukhi Signed-off-by: Jim Keniston Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/process.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 4305d2b..2e33665 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -718,13 +718,6 @@ kernel_thread_helper (int (*fn)(void *), void *arg) void flush_thread (void) { - /* - * Remove function-return probe instances associated with this task - * and put them back on the free list. Do not insert an exit probe for - * this function, it will be disabled by kprobe_flush_task if you do. - */ - kprobe_flush_task(current); - /* drop floating-point and debug-register state if it exists: */ current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); ia64_drop_fpu(current); -- cgit v1.1 From b77dae5293efba42ea1ff04d410ee68e66d5b0cf Mon Sep 17 00:00:00 2001 From: Dean Roe Date: Wed, 9 Nov 2005 14:25:06 -0600 Subject: [IA64] - Make pfn_valid more precise for SGI Altix systems A single SGI Altix system can be divided into multiple partitions, each running their own instance of the Linux kernel. pfn_valid() is currently not optimal for any but the first partition, since it does not compare the pfn with min_low_pfn before calling the more costly ia64_pfn_valid(). 
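[Editorial note] The header half of this change lives outside arch/ia64/kernel (in the ia64 pfn_valid() definition used with CONFIG_VIRTUAL_MEM_MAP), so only the min_low_pfn export appears in the diff below. The intended shape of the check is roughly the sketch that follows; treat it as an approximation, not the exact macro.

	/* Sketch: cheap range check against min_low_pfn/max_low_pfn before
	 * paying for the virtual mem_map walk in ia64_pfn_valid(). */
	#define pfn_valid(pfn) \
		(((pfn) >= min_low_pfn) && ((pfn) < max_low_pfn) && ia64_pfn_valid(pfn))
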
Signed-off-by: Dean Roe Signed-off-by: Tony Luck --- arch/ia64/kernel/ia64_ksyms.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 0157281..5db9d3b 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -42,6 +42,7 @@ EXPORT_SYMBOL(clear_page); #ifdef CONFIG_VIRTUAL_MEM_MAP #include +EXPORT_SYMBOL(min_low_pfn); /* defined by bootmem.c, but not exported by generic code */ EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic code */ #endif -- cgit v1.1 From 5a94bcfd2a18edcf368b3128c7df07b58e529932 Mon Sep 17 00:00:00 2001 From: Keshavamurthy Anil S Date: Tue, 22 Nov 2005 14:15:49 -0800 Subject: [IA64] Remove getting break_num by decoding instruction break.b always sets cr.iim to 0 and the current code tries to get the break_num by decoding instruction. However, their seems to be a race condition while reading the regs->cr_iip, as on other cpu the break.b at regs->cr_iip might have been replaced with the original instruction as a result of unregister_kprobe() and hence decoding instruction to obtain break_num will result in wrong value in this case. Also includes changes to kprobes.c which now has to handle break number zero. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Tony Luck --- arch/ia64/kernel/kprobes.c | 2 +- arch/ia64/kernel/traps.c | 18 ------------------ 2 files changed, 1 insertion(+), 19 deletions(-) (limited to 'arch/ia64/kernel') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 801eeae..2895d6e 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -740,7 +740,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, switch(val) { case DIE_BREAK: /* err is break number from ia64_bad_break() */ - if (args->err == 0x80200 || args->err == 0x80300) + if (args->err == 0x80200 || args->err == 0x80300 || args->err == 0) if (pre_kprobes_handler(args)) ret = NOTIFY_STOP; break; diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index fba5fdd..d3e0ecb 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -132,24 +132,6 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs) siginfo_t siginfo; int sig, code; - /* break.b always sets cr.iim to 0, which causes problems for - * debuggers. Get the real break number from the original instruction, - * but only for kernel code. User space break.b is left alone, to - * preserve the existing behaviour. All break codings have the same - * format, so there is no need to check the slot type. - */ - if (break_num == 0 && !user_mode(regs)) { - struct ia64_psr *ipsr = ia64_psr(regs); - unsigned long *bundle = (unsigned long *)regs->cr_iip; - unsigned long slot; - switch (ipsr->ri) { - case 0: slot = (bundle[0] >> 5); break; - case 1: slot = (bundle[0] >> 46) | (bundle[1] << 18); break; - default: slot = (bundle[1] >> 23); break; - } - break_num = ((slot >> 36 & 1) << 20) | (slot >> 6 & 0xfffff); - } - /* SIGILL, SIGFPE, SIGSEGV, and SIGBUS want these field initialized: */ siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri); siginfo.si_imm = break_num; -- cgit v1.1