From 1af5ba514f0c2f2e2af965a4ffa5e8ab269271b9 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 25 Sep 2009 15:19:47 -0700 Subject: x86: Clean up and add missing log levels for k8 Convert all printk's in arch/x86/mm/k8topology_64.c to use pr_info() or pr_err() appropriately. Adds log levels for messages currently lacking them. Signed-off-by: David Rientjes Cc: Yinghai Lu Cc: Balbir Singh Cc: Ankita Garg Cc: Len Brown LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/k8topology_64.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 268f825..a81561a 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -91,14 +91,14 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (nb < 0) return nb; - printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); + pr_info("Scanning NUMA topology in Northbridge %d\n", nb); reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; if (numnodes <= 1) return -1; - printk(KERN_INFO "Number of nodes %d\n", numnodes); + pr_info("Number of nodes %d\n", numnodes); memset(&nodes, 0, sizeof(nodes)); prevbase = 0; @@ -111,28 +111,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) nodeid = limit & 7; if ((base & 3) == 0) { if (i < numnodes) - printk("Skipping disabled node %d\n", i); + pr_info("Skipping disabled node %d\n", i); continue; } if (nodeid >= numnodes) { - printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); + pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, + base, limit); continue; } if (!limit) { - printk(KERN_INFO "Skipping node entry %d (base %lx)\n", - i, base); + pr_info("Skipping node entry %d (base %lx)\n", + i, base); continue; } if ((base >> 8) & 3 || (limit >> 8) & 3) { - printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", - nodeid, (base>>8)&3, (limit>>8) & 3); + pr_err("Node %d using interleaving mode %lx/%lx\n", + nodeid, (base >> 8) & 3, (limit >> 8) & 3); return -1; } if (node_isset(nodeid, node_possible_map)) { - printk(KERN_INFO "Node %d already present. Skipping\n", - nodeid); + pr_info("Node %d already present, skipping\n", + nodeid); continue; } @@ -154,24 +154,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (limit > end) limit = end; if (limit == base) { - printk(KERN_ERR "Empty node %d\n", nodeid); + pr_err("Empty node %d\n", nodeid); continue; } if (limit < base) { - printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", + pr_err("Node %d bogus settings %lx-%lx.\n", nodeid, base, limit); continue; } /* Could sort here, but pun for now. Should not happen anyroads. */ if (prevbase > base) { - printk(KERN_ERR "Node map not sorted %lx,%lx\n", + pr_err("Node map not sorted %lx,%lx\n", prevbase, base); return -1; } - printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); + pr_info("Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); found++; @@ -188,10 +188,10 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) memnode_shift = compute_hash_shift(nodes, 8, NULL); if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); + pr_err("No NUMA node hash function found. Contact maintainer\n"); return -1; } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + pr_info("Using node hash shift of %d\n", memnode_shift); /* use the coreid bits from early_identify_cpu */ bits = boot_cpu_data.x86_coreid_bits; @@ -200,8 +200,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) /* need to get boot_cpu_id early for system with apicid lifting */ early_get_boot_cpu_id(); if (boot_cpu_physical_apicid > 0) { - printk(KERN_INFO "BSP APIC ID: %02x\n", - boot_cpu_physical_apicid); + pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); apicid_base = boot_cpu_physical_apicid; } -- cgit v1.1 From 8ee2debce32412118cf8c239e0026ace56ea1425 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 25 Sep 2009 15:20:00 -0700 Subject: x86: Export k8 physical topology To eventually interleave emulated nodes over physical nodes, we need to know the physical topology of the machine without actually registering it. This does the k8 node setup in two parts: detection and registration. NUMA emulation can then used the physical topology detected to setup the address ranges of emulated nodes accordingly. If emulation isn't used, the k8 nodes are registered as normal. Two formals are added to the x86 NUMA setup functions: `acpi' and `k8'. These represent whether ACPI or K8 NUMA has been detected; both cannot be true at the same time. This specifies to the NUMA emulation code whether an underlying physical NUMA topology exists and which interface to use. This patch deals solely with separating the k8 setup path into Northbridge detection and registration steps and leaves the ACPI changes for a subsequent patch. The `acpi' formal is added here, however, to avoid touching all the header files again in the next patch. This approach also ensures emulated nodes will not span physical nodes so the true memory latency is not misrepresented. k8_get_nodes() may now be used to export the k8 physical topology of the machine for NUMA emulation. Signed-off-by: David Rientjes Cc: Andreas Herrmann Cc: Yinghai Lu Cc: Balbir Singh Cc: Ankita Garg Cc: Len Brown LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 3 ++- arch/x86/mm/k8topology_64.c | 52 +++++++++++++++++++++++++++++++++------------ arch/x86/mm/numa_32.c | 4 ++-- arch/x86/mm/numa_64.c | 6 +++--- 5 files changed, 48 insertions(+), 21 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 30938c1..5e32b07 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -703,8 +703,8 @@ void __init find_low_pfn_range(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init initmem_init(unsigned long start_pfn, - unsigned long end_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8) { #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5a4398a..c20d30b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start, } #ifndef CONFIG_NUMA -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8) { unsigned long bootmap_size, bootmap; diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index a81561a..b9e2dbf 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -24,6 +24,9 @@ #include #include +static struct bootnode __initdata nodes[8]; +static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; + static __init int find_northbridge(void) { int num; @@ -76,12 +79,26 @@ static __init void early_get_boot_cpu_id(void) early_init_lapic_mapping(); } -int __init k8_scan_nodes(unsigned long start, unsigned long end) +int __init k8_get_nodes(struct bootnode *physnodes) { - unsigned numnodes, cores, bits, apicid_base; + int i; + int ret = 0; + + for_each_node_mask(i, nodes_parsed) { + physnodes[ret].start = nodes[i].start; + physnodes[ret].end = nodes[i].end; + ret++; + } + return ret; +} + +int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long start = PFN_PHYS(start_pfn); + unsigned long end = PFN_PHYS(end_pfn); + unsigned numnodes; unsigned long prevbase; - struct bootnode nodes[8]; - int i, j, nb, found = 0; + int i, nb, found = 0; u32 nodeid, reg; if (!early_pci_allowed()) @@ -98,9 +115,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (numnodes <= 1) return -1; - pr_info("Number of nodes %d\n", numnodes); + pr_info("Number of physical nodes %d\n", numnodes); - memset(&nodes, 0, sizeof(nodes)); prevbase = 0; for (i = 0; i < 8; i++) { unsigned long base, limit; @@ -130,7 +146,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) nodeid, (base >> 8) & 3, (limit >> 8) & 3); return -1; } - if (node_isset(nodeid, node_possible_map)) { + if (node_isset(nodeid, nodes_parsed)) { pr_info("Node %d already present, skipping\n", nodeid); continue; @@ -141,8 +157,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) limit |= (1<<24)-1; limit++; - if (limit > max_pfn << PAGE_SHIFT) - limit = max_pfn << PAGE_SHIFT; + if (limit > end) + limit = end; if (limit <= base) continue; @@ -180,12 +196,23 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) prevbase = base; - node_set(nodeid, node_possible_map); + node_set(nodeid, nodes_parsed); } if (!found) return -1; + return 0; +} +int __init k8_scan_nodes(void) +{ + unsigned int bits; + unsigned int cores; + unsigned int apicid_base; + int i; + + BUG_ON(nodes_empty(nodes_parsed)); + node_possible_map = nodes_parsed; memnode_shift = compute_hash_shift(nodes, 8, NULL); if (memnode_shift < 0) { pr_err("No NUMA node hash function found. Contact maintainer\n"); @@ -204,9 +231,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) apicid_base = boot_cpu_physical_apicid; } - for (i = 0; i < 8; i++) { - if (nodes[i].start == nodes[i].end) - continue; + for_each_node_mask(i, node_possible_map) { + int j; e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d253006..b20760c 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -347,8 +347,8 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -void __init initmem_init(unsigned long start_pfn, - unsigned long end_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8) { int nid; long kva_target_pfn; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 459913b..dad5f42 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -524,7 +524,8 @@ out: } #endif /* CONFIG_NUMA_EMU */ -void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, + int acpi, int k8) { int i; @@ -547,8 +548,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) #endif #ifdef CONFIG_K8_NUMA - if (!numa_off && !k8_scan_nodes(start_pfn< Date: Fri, 25 Sep 2009 15:20:04 -0700 Subject: x86: Export srat physical topology This is the counterpart to "x86: export k8 physical topology" for SRAT. It is not as invasive because the acpi code already seperates node setup into detection and registration steps, with the exception of registering e820 active regions in acpi_numa_memory_affinity_init(). This is now moved to acpi_scan_nodes() if NUMA emulation is disabled or deferred. acpi_numa_init() now returns a value which specifies whether an underlying SRAT was located. If so, that topology can be used by the emulation code to interleave emulated nodes over physical nodes or to register the nodes for ACPI. acpi_get_nodes() may now be used to export the srat physical topology of the machine for NUMA emulation. Signed-off-by: David Rientjes Cc: Andreas Herrmann Cc: Yinghai Lu Cc: Balbir Singh Cc: Ankita Garg Cc: Len Brown LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 4 ++-- arch/x86/mm/srat_64.c | 28 +++++++++++++++++++++------- 2 files changed, 23 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index dad5f42..d1a3d94 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -540,8 +540,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, #endif #ifdef CONFIG_ACPI_NUMA - if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, - last_pfn << PAGE_SHIFT)) + if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, + last_pfn << PAGE_SHIFT)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index dbb5381..891cbe6 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, start, end); - e820_register_active_regions(node, start >> PAGE_SHIFT, - end >> PAGE_SHIFT); if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { update_nodes_add(node, start, end); @@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) void __init acpi_numa_arch_fixup(void) {} +int __init acpi_get_nodes(struct bootnode *physnodes) +{ + int i; + int ret = 0; + + for_each_node_mask(i, nodes_parsed) { + physnodes[ret].start = nodes[i].start; + physnodes[ret].end = nodes[i].end; + ret++; + } + return ret; +} + /* Use the information discovered above to actually set up the nodes. */ int __init acpi_scan_nodes(unsigned long start, unsigned long end) { @@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) for (i = 0; i < MAX_NUMNODES; i++) cutoff_node(i, start, end); - if (!nodes_cover_memory(nodes)) { - bad_srat(); - return -1; - } - memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, memblk_nodeid); if (memnode_shift < 0) { @@ -364,6 +370,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } + for_each_node_mask(i, nodes_parsed) + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + nodes[i].end >> PAGE_SHIFT); + if (!nodes_cover_memory(nodes)) { + bad_srat(); + return -1; + } + /* Account for nodes with cpus and no memory */ nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); -- cgit v1.1 From adc1938994f7f1112d335d998b5218b0aa680ad6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 25 Sep 2009 15:20:09 -0700 Subject: x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes Cc: Linus Torvalds Cc: Andreas Herrmann Cc: Yinghai Lu Cc: Balbir Singh Cc: Ankita Garg Cc: Len Brown LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 211 +++++++++++++++++++++++++++++++++++++++++++------- arch/x86/mm/srat_64.c | 1 - 2 files changed, 184 insertions(+), 28 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index d1a3d94..086f98a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -306,8 +306,71 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ +static struct bootnode nodes[MAX_NUMNODES] __initdata; +static struct bootnode physnodes[MAX_NUMNODES] __initdata; static char *cmdline __initdata; +static int __init setup_physnodes(unsigned long start, unsigned long end, + int acpi, int k8) +{ + int nr_nodes = 0; + int ret = 0; + int i; + +#ifdef CONFIG_ACPI_NUMA + if (acpi) + nr_nodes = acpi_get_nodes(physnodes); +#endif +#ifdef CONFIG_K8_NUMA + if (k8) + nr_nodes = k8_get_nodes(physnodes); +#endif + /* + * Basic sanity checking on the physical node map: there may be errors + * if the SRAT or K8 incorrectly reported the topology or the mem= + * kernel parameter is used. + */ + for (i = 0; i < nr_nodes; i++) { + if (physnodes[i].start == physnodes[i].end) + continue; + if (physnodes[i].start > end) { + physnodes[i].end = physnodes[i].start; + continue; + } + if (physnodes[i].end < start) { + physnodes[i].start = physnodes[i].end; + continue; + } + if (physnodes[i].start < start) + physnodes[i].start = start; + if (physnodes[i].end > end) + physnodes[i].end = end; + } + + /* + * Remove all nodes that have no memory or were truncated because of the + * limited address range. + */ + for (i = 0; i < nr_nodes; i++) { + if (physnodes[i].start == physnodes[i].end) + continue; + physnodes[ret].start = physnodes[i].start; + physnodes[ret].end = physnodes[i].end; + ret++; + } + + /* + * If no physical topology was detected, a single node is faked to cover + * the entire address space. + */ + if (!ret) { + physnodes[ret].start = start; + physnodes[ret].end = end; + ret = 1; + } + return ret; +} + /* * Setups up nid to range from addr to addr + size. If the end * boundary is greater than max_addr, then max_addr is used instead. @@ -315,11 +378,9 @@ static char *cmdline __initdata; * allocation past addr and -1 otherwise. addr is adjusted to be at * the end of the node. */ -static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, - u64 size, u64 max_addr) +static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) { int ret = 0; - nodes[nid].start = *addr; *addr += size; if (*addr >= max_addr) { @@ -335,12 +396,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, } /* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr. The return value is the number of nodes allocated. + */ +static int __init split_nodes_interleave(u64 addr, u64 max_addr, + int nr_phys_nodes, int nr_nodes) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 size; + int big; + int ret = 0; + int i; + + if (nr_nodes <= 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=fake=%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes = MAX_NUMNODES; + } + + size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes; + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. + */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; + } + + for (i = 0; i < nr_phys_nodes; i++) + if (physnodes[i].start != physnodes[i].end) + node_set(i, physnode_mask); + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 end = physnodes[i].start + size; + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + + if (ret < big) + end += FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - physnodes[i].start - + e820_hole_size(physnodes[i].start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > physnodes[i].end) { + end = physnodes[i].end; + break; + } + } + + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (physnodes[i].end - end - + e820_hole_size(end, physnodes[i].end) < size) + end = physnodes[i].end; + + /* + * Avoid allocating more nodes than requested, which can + * happen as a result of rounding down each node's size + * to FAKE_NODE_MIN_SIZE. + */ + if (nodes_weight(physnode_mask) + ret >= nr_nodes) + end = physnodes[i].end; + + if (setup_node_range(ret++, &physnodes[i].start, + end - physnodes[i].start, + physnodes[i].end) < 0) + node_clear(i, physnode_mask); + } + } + return ret; +} + +/* * Splits num_nodes nodes up equally starting at node_start. The return value * is the number of nodes split up and addr is adjusted to be at the end of the * last node allocated. */ -static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, +static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, int num_nodes) { unsigned int big; @@ -388,7 +548,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, break; } } - if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) + if (setup_node_range(i, addr, end - *addr, max_addr) < 0) break; } return i - node_start + 1; @@ -399,12 +559,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, * always assigned to a final node and can be asymmetric. Returns the number of * nodes split. */ -static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, u64 size) +static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, + u64 size) { int i = node_start; size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; - while (!setup_node_range(i++, nodes, addr, size, max_addr)) + while (!setup_node_range(i++, addr, size, max_addr)) ; return i - node_start; } @@ -413,15 +573,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. */ -static struct bootnode nodes[MAX_NUMNODES] __initdata; - -static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn) +static int __init numa_emulation(unsigned long start_pfn, + unsigned long last_pfn, int acpi, int k8) { u64 size, addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; + int num_phys_nodes; - memset(&nodes, 0, sizeof(nodes)); + num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); /* * If the numa=fake command-line is just a single number N, split the * system RAM into N fake nodes. @@ -429,7 +589,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { long n = simple_strtol(cmdline, NULL, 0); - num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); + num_nodes = split_nodes_interleave(addr, max_addr, + num_phys_nodes, n); if (num_nodes < 0) return num_nodes; goto out; @@ -456,8 +617,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; if (size) for (i = 0; i < coeff; i++, num_nodes++) - if (setup_node_range(num_nodes, nodes, - &addr, size, max_addr) < 0) + if (setup_node_range(num_nodes, &addr, + size, max_addr) < 0) goto done; if (!*cmdline) break; @@ -473,7 +634,7 @@ done: if (addr < max_addr) { if (coeff_flag && coeff < 0) { /* Split remaining nodes into num-sized chunks */ - num_nodes += split_nodes_by_size(nodes, &addr, max_addr, + num_nodes += split_nodes_by_size(&addr, max_addr, num_nodes, num); goto out; } @@ -482,7 +643,7 @@ done: /* Split remaining nodes into coeff chunks */ if (coeff <= 0) break; - num_nodes += split_nodes_equally(nodes, &addr, max_addr, + num_nodes += split_nodes_equally(&addr, max_addr, num_nodes, coeff); break; case ',': @@ -490,8 +651,8 @@ done: break; default: /* Give one final node */ - setup_node_range(num_nodes, nodes, &addr, - max_addr - addr, max_addr); + setup_node_range(num_nodes, &addr, max_addr - addr, + max_addr); num_nodes++; } } @@ -505,14 +666,10 @@ out: } /* - * We need to vacate all active ranges that may have been registered by - * SRAT and set acpi_numa to -1 so that srat_disabled() always returns - * true. NUMA emulation has succeeded so we will not scan ACPI nodes. + * We need to vacate all active ranges that may have been registered for + * the e820 memory map. */ remove_all_active_ranges(); -#ifdef CONFIG_ACPI_NUMA - acpi_numa = -1; -#endif for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); @@ -533,7 +690,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, last_pfn)) + if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 891cbe6..34aa438 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -468,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) for (i = 0; i < num_nodes; i++) if (fake_nodes[i].start != fake_nodes[i].end) node_set(i, nodes_parsed); - WARN_ON(!nodes_cover_memory(fake_nodes)); } static int null_slit_node_compare(int a, int b) -- cgit v1.1 From b9af7c0d44b8bb71e3af5e94688d076414aa8c87 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 14 Oct 2009 14:46:55 -0700 Subject: x86-64: preserve large page mapping for 1st 2MB kernel txt with CONFIG_DEBUG_RODATA In the first 2MB, kernel text is co-located with kernel static page tables setup by head_64.S. CONFIG_DEBUG_RODATA chops this 2MB large page mapping to small 4KB pages as we mark the kernel text as RO, leaving the static page tables as RW. With CONFIG_DEBUG_RODATA disabled, OLTP run on NHM-EP shows 1% improvement with 2% reduction in system time and 1% improvement in iowait idle time. To recover this, move the kernel static page tables to .data section, so that we don't have to break the first 2MB of kernel text to small pages with CONFIG_DEBUG_RODATA. Signed-off-by: Suresh Siddha LKML-Reference: <20091014220254.063193621@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index c20d30b..7dafd41 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -699,7 +699,7 @@ static int kernel_set_to_readonly; void set_kernel_text_rw(void) { - unsigned long start = PFN_ALIGN(_stext); + unsigned long start = PFN_ALIGN(_text); unsigned long end = PFN_ALIGN(__start_rodata); if (!kernel_set_to_readonly) @@ -713,7 +713,7 @@ void set_kernel_text_rw(void) void set_kernel_text_ro(void) { - unsigned long start = PFN_ALIGN(_stext); + unsigned long start = PFN_ALIGN(_text); unsigned long end = PFN_ALIGN(__start_rodata); if (!kernel_set_to_readonly) @@ -727,7 +727,7 @@ void set_kernel_text_ro(void) void mark_rodata_ro(void) { - unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); + unsigned long start = PFN_ALIGN(_text), end = PFN_ALIGN(__end_rodata); unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; -- cgit v1.1 From 74e081797bd9d2a7d8005fe519e719df343a2ba8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 14 Oct 2009 14:46:56 -0700 Subject: x86-64: align RODATA kernel section to 2MB with CONFIG_DEBUG_RODATA CONFIG_DEBUG_RODATA chops the large pages spanning boundaries of kernel text/rodata/data to small 4KB pages as they are mapped with different attributes (text as RO, RODATA as RO and NX etc). On x86_64, preserve the large page mappings for kernel text/rodata/data boundaries when CONFIG_DEBUG_RODATA is enabled. This is done by allowing the RODATA section to be hugepage aligned and having same RWX attributes for the 2MB page boundaries Extra Memory pages padding the sections will be freed during the end of the boot and the kernel identity mappings will have different RWX permissions compared to the kernel text mappings. Kernel identity mappings to these physical pages will be mapped with smaller pages but large page mappings are still retained for kernel text,rodata,data mappings. Signed-off-by: Suresh Siddha LKML-Reference: <20091014220254.190119924@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 14 +++++++++++++- arch/x86/mm/pageattr.c | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7dafd41..0ed09fa 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -727,9 +727,13 @@ void set_kernel_text_ro(void) void mark_rodata_ro(void) { - unsigned long start = PFN_ALIGN(_text), end = PFN_ALIGN(__end_rodata); + unsigned long start = PFN_ALIGN(_text); unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + unsigned long end = (unsigned long) &__end_rodata_hpage_align; + unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); + unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); + unsigned long data_start = (unsigned long) &_sdata; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -752,6 +756,14 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: again\n"); set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif + + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(text_end)), + (unsigned long) + page_address(virt_to_page(rodata_start))); + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(rodata_end)), + (unsigned long) page_address(virt_to_page(data_start))); } #endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dd38bfb..b494fc4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -279,6 +279,20 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) + /* + * Kernel text mappings for the large page aligned .rodata section + * will be read-only. For the kernel identity mappings covering + * the holes caused by this alignment can be anything. + * + * This will preserve the large page mappings for kernel text/data + * at no extra cost. + */ + if (within(address, (unsigned long)_text, + (unsigned long)__end_rodata_hpage_align)) + pgprot_val(forbidden) |= _PAGE_RW; +#endif + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); return prot; -- cgit v1.1 From b1258ac2963d42ee7e807d2993d15e3dd39ff4b0 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 22 Oct 2009 11:27:22 +0900 Subject: x86: Remove pfn in add_one_highpage_init() commit cc9f7a0ccf000d4db5fbdc7b0ae48eefea102f69 changed add_one_highpage_init. We don't use pfn any more. Let's remove unnecessary argument. This patch doesn't chage function behavior. This patch is based on v2.6.32-rc5. Signed-off-by: Minchan Kim Cc: Yinghai Lu LKML-Reference: <20091022112722.adc8e55c.minchan.kim@barrios-desktop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5e32b07..f64d0d5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -412,7 +412,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -static void __init add_one_highpage_init(struct page *page, int pfn) +static void __init add_one_highpage_init(struct page *page) { ClearPageReserved(page); init_page_count(page); @@ -445,7 +445,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn, if (!pfn_valid(node_pfn)) continue; page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn); + add_one_highpage_init(page); } return 0; -- cgit v1.1 From 883242dd0e5faaba041528a9a99f483f2a656c83 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Oct 2009 13:15:11 -0400 Subject: tracing: allow to change permissions for text with dynamic ftrace enabled The commit 74e081797bd9d2a7d8005fe519e719df343a2ba8 x86-64: align RODATA kernel section to 2MB with CONFIG_DEBUG_RODATA prevents text sections from becoming read/write using set_memory_rw. The dynamic ftrace changes all text pages to read/write just before converting the calls to tracing to nops, and vice versa. I orginally just added a flag to allow this transaction when ftrace did the change, but I also found that when the CPA testing was running it would remove the read/write as well, and ftrace does not do the text conversion on boot up, and the CPA changes caused the dynamic tracer to fail on self tests. The current solution I have is to simply not to prevent change_page_attr from setting the RW bit for kernel text pages. Reported-by: Ingo Molnar Cc: Suresh Siddha Cc: H. Peter Anvin Signed-off-by: Steven Rostedt --- arch/x86/mm/pageattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index b494fc4..78d3168 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -279,7 +279,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && \ + !defined(CONFIG_DYNAMIC_FTRACE) /* * Kernel text mappings for the large page aligned .rodata section * will be read-only. For the kernel identity mappings covering -- cgit v1.1 From 502f660466ba7a66711ffdf414b1f7f1131dcbf7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 28 Oct 2009 18:46:56 -0800 Subject: x86, cpa: Fix kernel text RO checks in static_protection() Steven Rostedt reported that we are unconditionally making the kernel text mapping as read-only. i.e., if someone does cpa() to the kernel text area for setting/clearing any page table attribute, we unconditionally clear the read-write attribute for the kernel text mapping that is set at compile time. We should delay (to forbid the write attribute) and enforce only after the kernel has mapped the text as read-only. Reported-by: Steven Rostedt Signed-off-by: Suresh Siddha Acked-by: Steven Rostedt Tested-by: Steven Rostedt LKML-Reference: <20091029024820.996634347@sbs-t61.sc.intel.com> [ marked kernel_set_to_readonly as __read_mostly ] Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/pageattr.c | 10 ++++++---- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f64d0d5..c973f8e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -997,7 +997,7 @@ static noinline int do_test_wp_bit(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -static int kernel_set_to_readonly; +int kernel_set_to_readonly __read_mostly; void set_kernel_text_rw(void) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0ed09fa..4b507c0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -695,7 +695,7 @@ void __init mem_init(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -static int kernel_set_to_readonly; +int kernel_set_to_readonly; void set_kernel_text_rw(void) { diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 78d3168..8d1e8d9 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -282,14 +282,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && \ !defined(CONFIG_DYNAMIC_FTRACE) /* - * Kernel text mappings for the large page aligned .rodata section - * will be read-only. For the kernel identity mappings covering - * the holes caused by this alignment can be anything. + * Once the kernel maps the text as RO (kernel_set_to_readonly is set), + * kernel text mappings for the large page aligned text, rodata sections + * will be always read-only. For the kernel identity mappings covering + * the holes caused by this alignment can be anything that user asks. * * This will preserve the large page mappings for kernel text/data * at no extra cost. */ - if (within(address, (unsigned long)_text, + if (kernel_set_to_readonly && + within(address, (unsigned long)_text, (unsigned long)__end_rodata_hpage_align)) pgprot_val(forbidden) |= _PAGE_RW; #endif -- cgit v1.1 From 55ca3cc1746335bb6ef1d3894ddb6d0c729b3518 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 28 Oct 2009 18:46:57 -0800 Subject: x86_64, ftrace: Make ftrace use kernel identity mapping to modify code On x86_64, kernel text mappings are mapped read-only with CONFIG_DEBUG_RODATA. So use the kernel identity mapping instead of the kernel text mapping to modify the kernel text. Signed-off-by: Suresh Siddha Acked-by: Steven Rostedt Tested-by: Steven Rostedt LKML-Reference: <20091029024821.080941108@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 8d1e8d9..09a140c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -279,8 +279,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && \ - !defined(CONFIG_DYNAMIC_FTRACE) +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections -- cgit v1.1 From e7d23dde9b7ebb575e2bcee2abefc9ec1e4adde9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 28 Oct 2009 18:46:58 -0800 Subject: x86_64, cpa: Use only text section in set_kernel_text_rw/ro set_kernel_text_rw()/set_kernel_text_ro() are marking pages starting from _text to __start_rodata as RW or RO. With CONFIG_DEBUG_RODATA, there might be free pages (associated with padding the sections to 2MB large page boundary) between text and rodata sections that are given back to page allocator. So we should use only use the start (__text) and end (__stop___ex_table) of the text section in set_kernel_text_rw()/set_kernel_text_ro(). Signed-off-by: Suresh Siddha Acked-by: Steven Rostedt Tested-by: Steven Rostedt LKML-Reference: <20091029024821.164525222@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4b507c0..5198b9b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -700,7 +700,7 @@ int kernel_set_to_readonly; void set_kernel_text_rw(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long end = PFN_ALIGN(__start_rodata); + unsigned long end = PFN_ALIGN(__stop___ex_table); if (!kernel_set_to_readonly) return; @@ -708,13 +708,18 @@ void set_kernel_text_rw(void) pr_debug("Set kernel text: %lx - %lx for read write\n", start, end); + /* + * Make the kernel identity mapping for text RW. Kernel text + * mapping will always be RO. Refer to the comment in + * static_protections() in pageattr.c + */ set_memory_rw(start, (end - start) >> PAGE_SHIFT); } void set_kernel_text_ro(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long end = PFN_ALIGN(__start_rodata); + unsigned long end = PFN_ALIGN(__stop___ex_table); if (!kernel_set_to_readonly) return; @@ -722,6 +727,9 @@ void set_kernel_text_ro(void) pr_debug("Set kernel text: %lx - %lx for read only\n", start, end); + /* + * Set the kernel identity mapping for text RO. + */ set_memory_ro(start, (end - start) >> PAGE_SHIFT); } -- cgit v1.1 From 583140afb989f24d115e80be5c91e503b58ccfc0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 13 Nov 2009 15:28:15 -0800 Subject: x86, pageattr: Make set_memory_(x|nx) aware of NX support Make set_memory_x/set_memory_nx directly aware of if NX is supported in the system or not, rather than requiring that every caller assesses that support independently. Signed-off-by: H. Peter Anvin Cc: Huang Ying Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Tejun Heo Cc: Tim Starling Cc: Hannes Eder LKML-Reference: <1258154897-6770-4-git-send-email-hpa@zytor.com> Acked-by: Kees Cook --- arch/x86/mm/pageattr.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 09a140c..1d4eb93 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1085,12 +1085,18 @@ EXPORT_SYMBOL(set_memory_array_wb); int set_memory_x(unsigned long addr, int numpages) { + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_x); int set_memory_nx(unsigned long addr, int numpages) { + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_nx); -- cgit v1.1 From 4763ed4d45522b876c97e1f7f4b659d211f75571 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 13 Nov 2009 15:28:16 -0800 Subject: x86, mm: Clean up and simplify NX enablement The 32- and 64-bit code used very different mechanisms for enabling NX, but even the 32-bit code was enabling NX in head_32.S if it is available. Furthermore, we had a bewildering collection of tests for the available of NX. This patch: a) merges the 32-bit set_nx() and the 64-bit check_efer() function into a single x86_configure_nx() function. EFER control is left to the head code. b) eliminates the nx_enabled variable entirely. Things that need to test for NX enablement can verify __supported_pte_mask directly, and cpu_has_nx gives the supported status of NX. Signed-off-by: H. Peter Anvin Cc: Tejun Heo Cc: Brian Gerst Cc: Yinghai Lu Cc: Pekka Enberg Cc: Vegard Nossum Cc: Jeremy Fitzhardinge Cc: Chris Wright LKML-Reference: <1258154897-6770-5-git-send-email-hpa@zytor.com> Acked-by: Kees Cook --- arch/x86/mm/init.c | 4 ++-- arch/x86/mm/setup_nx.c | 43 ++++++------------------------------------- 2 files changed, 8 insertions(+), 39 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 73ffd55..27ec2c2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -146,8 +146,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif - set_nx(); - if (nx_enabled) + /* XXX: replace this with Kees' improved messages */ + if (__supported_pte_mask & _PAGE_NX) printk(KERN_INFO "NX (Execute Disable) protection: active\n"); /* Enable PSE if available */ diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 513d8ed..355818b 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -3,10 +3,8 @@ #include #include +#include -int nx_enabled; - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static int disable_nx __cpuinitdata; /* @@ -22,48 +20,19 @@ static int __init noexec_setup(char *str) if (!str) return -EINVAL; if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; disable_nx = 0; } else if (!strncmp(str, "off", 3)) { disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; } + x86_configure_nx(); return 0; } early_param("noexec", noexec_setup); -#endif - -#ifdef CONFIG_X86_PAE -void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} -#else -void set_nx(void) -{ -} -#endif -#ifdef CONFIG_X86_64 -void __cpuinit check_efer(void) +void __cpuinit x86_configure_nx(void) { - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || disable_nx) + if (cpu_has_nx && !disable_nx) + __supported_pte_mask |= _PAGE_NX; + else __supported_pte_mask &= ~_PAGE_NX; } -#endif - -- cgit v1.1 From 4b0f3b81eb33ef18283aa71440cccfede1753ae0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 13 Nov 2009 15:28:17 -0800 Subject: x86, mm: Report state of NX protections during boot It is possible for x86_64 systems to lack the NX bit either due to the hardware lacking support or the BIOS having turned off the CPU capability, so NX status should be reported. Additionally, anyone booting NX-capable CPUs in 32bit mode without PAE will lack NX functionality, so this change provides feedback for that case as well. Signed-off-by: Kees Cook Signed-off-by: H. Peter Anvin LKML-Reference: <1258154897-6770-6-git-send-email-hpa@zytor.com> --- arch/x86/mm/init.c | 4 ---- arch/x86/mm/setup_nx.c | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 27ec2c2..d406c52 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -146,10 +146,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif - /* XXX: replace this with Kees' improved messages */ - if (__supported_pte_mask & _PAGE_NX) - printk(KERN_INFO "NX (Execute Disable) protection: active\n"); - /* Enable PSE if available */ if (cpu_has_pse) set_in_cr4(X86_CR4_PSE); diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 355818b..a3250aa 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -36,3 +36,25 @@ void __cpuinit x86_configure_nx(void) else __supported_pte_mask &= ~_PAGE_NX; } + +void __init x86_report_nx(void) +{ + if (!cpu_has_nx) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU or disabled in BIOS!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + if (disable_nx) { + printk(KERN_INFO "NX (Execute Disable) protection: " + "disabled by kernel command line option\n"); + } else { + printk(KERN_INFO "NX (Execute Disable) protection: " + "active\n"); + } +#else + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); +#endif + } +} -- cgit v1.1 From 350f8f5631922c7848ec4b530c111cb8c2ff7caa Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 13 Nov 2009 11:54:40 +0000 Subject: x86: Eliminate redundant/contradicting cache line size config options Rather than having X86_L1_CACHE_BYTES and X86_L1_CACHE_SHIFT (with inconsistent defaults), just having the latter suffices as the former can be easily calculated from it. To be consistent, also change X86_INTERNODE_CACHE_BYTES to X86_INTERNODE_CACHE_SHIFT, and set it to 7 (128 bytes) for NUMA to account for last level cache line size (which here matters more than L1 cache line size). Finally, make sure the default value for X86_L1_CACHE_SHIFT, when X86_GENERIC is selected, is being seen before that for the individual CPU model options (other than on x86-64, where GENERIC_CPU is part of the choice construct, X86_GENERIC is a separate option on ix86). Signed-off-by: Jan Beulich Acked-by: Ravikiran Thirumalai Acked-by: Nick Piggin LKML-Reference: <4AFD5710020000780001F8F0@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 36fe08e..65b58e4 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -43,7 +44,7 @@ union smp_flush_state { spinlock_t tlbstate_lock; DECLARE_BITMAP(flush_cpumask, NR_CPUS); }; - char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; + char pad[INTERNODE_CACHE_BYTES]; } ____cacheline_internodealigned_in_smp; /* State is put into the per CPU data section, but padded -- cgit v1.1 From 021428ad1418cf3c386a1a0157140c3ea29b17ef Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Nov 2009 00:23:37 -0800 Subject: x86, numa, bootmem: Only free bootmem on NUMA failure path In the NUMA bootmem setup failure path we freed nodedata_phys incorrectly. Signed-off-by: Yinghai Lu Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Rusty Russell Cc: David Rientjes Cc: Andrew Morton LKML-Reference: <4B07A739.3030104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 086f98a..3acd870 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -239,8 +239,14 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) bootmap = early_node_mem(nodeid, bootmap_start, end, bootmap_pages<= end) - free_bootmem(nodedata_phys, pgdat_size); + if (nodedata_phys < start || nodedata_phys >= end) { + /* + * only need to free it if it is from other node + * bootmem + */ + if (nid != nodeid) + free_bootmem(nodedata_phys, pgdat_size); + } node_data[nodeid] = NULL; return; } -- cgit v1.1 From d9c2d5ac6af87b4491bff107113aaf16f6c2b2d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Nov 2009 00:23:37 -0800 Subject: x86, numa: Use near(er) online node instead of roundrobin for NUMA CPU to node mapping is set via the following sequence: 1. numa_init_array(): Set up roundrobin from cpu to online node 2. init_cpu_to_node(): Set that according to apicid_to_node[] according to srat only handle the node that is online, and leave other cpu on node without ram (aka not online) to still roundrobin. 3. later call srat_detect_node for Intel/AMD, will use first_online node or nearby node. Problem is that setup_per_cpu_areas() is not called between 2 and 3, the per_cpu for cpu on node with ram is on different node, and could put that on node with two hops away. So try to optimize this and add find_near_online_node() and call init_cpu_to_node(). Signed-off-by: Yinghai Lu Cc: Tejun Heo Cc: Linus Torvalds Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Rusty Russell Cc: David Rientjes Cc: Andrew Morton LKML-Reference: <4B07A739.3030104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3acd870..83bbc70 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -764,6 +764,25 @@ static __init int numa_setup(char *opt) early_param("numa", numa_setup); #ifdef CONFIG_NUMA + +static __init int find_near_online_node(int node) +{ + int n, val; + int min_val = INT_MAX; + int best_node = -1; + + for_each_online_node(n) { + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + return best_node; +} + /* * Setup early cpu_to_node. * @@ -795,7 +814,7 @@ void __init init_cpu_to_node(void) if (node == NUMA_NO_NODE) continue; if (!node_online(node)) - continue; + node = find_near_online_node(node); numa_set_node(cpu, node); } } -- cgit v1.1 From fd12a0d69aee6d90fa9b9890db24368a897f8423 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Thu, 19 Nov 2009 14:23:41 -0600 Subject: x86: UV SGI: Don't track GRU space in PAT GRU space is always mapped as WB in the page table. There is no need to track the mappings in the PAT. This also eliminates the "freeing invalid memtype" messages when the GRU space is unmapped. Signed-off-by: Jack Steiner LKML-Reference: <20091119202341.GA4420@sgi.com> [ v2: fix build failure ] Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e78cd0e..38a66ef 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -348,6 +349,11 @@ static int free_ram_pages_type(u64 start, u64 end) return 0; } +int default_is_untracked_pat_range(u64 start, u64 end) +{ + return is_ISA_range(start, end); +} + /* * req_type typically has one of the: * - _PAGE_CACHE_WB @@ -388,7 +394,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } /* Low ISA region is always mapped WB in page table. No need to track */ - if (is_ISA_range(start, end - 1)) { + if (x86_platform.is_untracked_pat_range(start, end - 1)) { if (new_type) *new_type = _PAGE_CACHE_WB; return 0; @@ -499,7 +505,7 @@ int free_memtype(u64 start, u64 end) return 0; /* Low ISA region is always mapped WB. No need to track */ - if (is_ISA_range(start, end - 1)) + if (x86_platform.is_untracked_pat_range(start, end - 1)) return 0; is_range_ram = pat_pagerange_is_ram(start, end); @@ -582,7 +588,7 @@ static unsigned long lookup_memtype(u64 paddr) int rettype = _PAGE_CACHE_WB; struct memtype *entry; - if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE - 1)) return rettype; if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { -- cgit v1.1 From 8a27138924f64d2f30c1022f909f74480046bc3f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 23 Nov 2009 14:49:20 -0800 Subject: x86, mm: is_untracked_pat_range() takes a normal semiclosed range is_untracked_pat_range() -- like its components, is_ISA_range() and is_GRU_range(), takes a normal semiclosed interval (>=, <) whereas the PAT code called it as if it took a closed range (>=, <=). Fix. Although this is a bug, I believe it is non-manifest, simply because none of the callers will call this with non-page-aligned addresses. Signed-off-by: H. Peter Anvin Acked-by: Thomas Gleixner Acked-by: Suresh Siddha LKML-Reference: <20091119202341.GA4420@sgi.com> --- arch/x86/mm/pat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 38a66ef..b5bc08c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -394,7 +394,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } /* Low ISA region is always mapped WB in page table. No need to track */ - if (x86_platform.is_untracked_pat_range(start, end - 1)) { + if (x86_platform.is_untracked_pat_range(start, end)) { if (new_type) *new_type = _PAGE_CACHE_WB; return 0; @@ -505,7 +505,7 @@ int free_memtype(u64 start, u64 end) return 0; /* Low ISA region is always mapped WB. No need to track */ - if (x86_platform.is_untracked_pat_range(start, end - 1)) + if (x86_platform.is_untracked_pat_range(start, end)) return 0; is_range_ram = pat_pagerange_is_ram(start, end); @@ -588,7 +588,7 @@ static unsigned long lookup_memtype(u64 paddr) int rettype = _PAGE_CACHE_WB; struct memtype *entry; - if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) return rettype; if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { -- cgit v1.1 From eb41c8be89dbe079f49202774e04a79ccac48a09 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 23 Nov 2009 14:46:07 -0800 Subject: x86, platform: Change is_untracked_pat_range() to bool; cleanup init - Change is_untracked_pat_range() to return bool. - Clean up the initialization of is_untracked_pat_range() -- by default, we simply point it at is_ISA_range() directly. - Move is_untracked_pat_range to the end of struct x86_platform, since it is the newest field. Signed-off-by: H. Peter Anvin Acked-by: Thomas Gleixner Cc: Jack Steiner LKML-Reference: <20091119202341.GA4420@sgi.com> --- arch/x86/mm/pat.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index b5bc08c..ef71251 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -349,11 +349,6 @@ static int free_ram_pages_type(u64 start, u64 end) return 0; } -int default_is_untracked_pat_range(u64 start, u64 end) -{ - return is_ISA_range(start, end); -} - /* * req_type typically has one of the: * - _PAGE_CACHE_WB -- cgit v1.1 From b24c2a925a9837cccf54d50aeac22ba0cbc15455 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Nov 2009 02:48:18 -0800 Subject: x86: Move find_smp_config() earlier and avoid bootmem usage Move the find_smp_config() call to before bootmem is initialized. Use reserve_early() instead of reserve_bootmem() in it. This simplifies the code, we only need to call find_smp_config() once and can remove the now unneeded reserve parameter from x86_init_mpparse::find_smp_config. We thus also reduce x86's dependency on bootmem allocations. Signed-off-by: Yinghai Lu LKML-Reference: <4B0BB9F2.70907@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/k8topology_64.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index b9e2dbf..970ed57 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -57,18 +57,6 @@ static __init void early_get_boot_cpu_id(void) * need to get boot_cpu_id so can use that to create apicid_to_node * in k8_scan_nodes() */ - /* - * Find possible boot-time SMP configuration: - */ -#ifdef CONFIG_X86_MPPARSE - early_find_smp_config(); -#endif -#ifdef CONFIG_ACPI - /* - * Read APIC information from ACPI tables. - */ - early_acpi_boot_init(); -#endif #ifdef CONFIG_X86_MPPARSE /* * get boot-time SMP configuration: -- cgit v1.1 From dd4377b02d9f028006beed1b7b1695ee5d1498b6 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 26 Nov 2009 19:53:48 +0800 Subject: x86/pat: Trivial: don't create debugfs for memtype if pat is disabled If pat is disabled (boot with nopat), there's no need to create debugfs for it, it's empty all the time. Signed-off-by: Xiaotian Feng Cc: Suresh Siddha Cc: Venkatesh Pallipadi Cc: H. Peter Anvin LKML-Reference: <1259236428-16329-1-git-send-email-dfeng@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ef71251..a81b7e7 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -1019,8 +1019,10 @@ static const struct file_operations memtype_fops = { static int __init pat_memtype_list_init(void) { - debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, - NULL, &memtype_fops); + if (pat_enabled) { + debugfs_create_file("pat_memtype_list", S_IRUSR, + arch_debugfs_dir, NULL, &memtype_fops); + } return 0; } -- cgit v1.1