diff options
139 files changed, 2902 insertions, 2512 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 22d89aa3..8533f5f 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -767,6 +767,7 @@ The "Locked" indicates whether the mapping is locked in memory or not. MemTotal: 16344972 kB MemFree: 13634064 kB +MemAvailable: 14836172 kB Buffers: 3656 kB Cached: 1195708 kB SwapCached: 0 kB @@ -799,6 +800,14 @@ AnonHugePages: 49152 kB MemTotal: Total usable ram (i.e. physical ram minus a few reserved bits and the kernel binary code) MemFree: The sum of LowFree+HighFree +MemAvailable: An estimate of how much memory is available for starting new + applications, without swapping. Calculated from MemFree, + SReclaimable, the size of the file LRU lists, and the low + watermarks in each zone. + The estimate takes into account that the system needs some + page cache to function well, and that not all reclaimable + slab will be reclaimable, due to items being in use. The + impact of those factors will vary from system to system. Buffers: Relatively temporary storage for raw disk blocks shouldn't get tremendously large (20MB or so) Cached: in-memory cache for files read from the disk (the diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 1fbd4eb..9f5481b 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -47,6 +47,7 @@ Currently, these files are in /proc/sys/vm: - numa_zonelist_order - oom_dump_tasks - oom_kill_allocating_task +- overcommit_kbytes - overcommit_memory - overcommit_ratio - page-cluster @@ -574,6 +575,17 @@ The default value is 0. ============================================================== +overcommit_kbytes: + +When overcommit_memory is set to 2, the committed address space is not +permitted to exceed swap plus this amount of physical RAM. See below. + +Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one +of them may be specified at a time. Setting one disables the other (which +then appears as 0 when read). + +============================================================== + overcommit_memory: This value contains a flag that enables memory overcommitment. diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting index 8eaa2fc..cbfaaa6 100644 --- a/Documentation/vm/overcommit-accounting +++ b/Documentation/vm/overcommit-accounting @@ -14,8 +14,8 @@ The Linux kernel supports the following overcommit handling modes 2 - Don't overcommit. The total address space commit for the system is not permitted to exceed swap + a - configurable percentage (default is 50) of physical RAM. - Depending on the percentage you use, in most situations + configurable amount (default is 50%) of physical RAM. + Depending on the amount you use, in most situations this means a process will not be killed while accessing pages but will receive errors on memory allocation as appropriate. @@ -26,7 +26,8 @@ The Linux kernel supports the following overcommit handling modes The overcommit policy is set via the sysctl `vm.overcommit_memory'. -The overcommit percentage is set via `vm.overcommit_ratio'. +The overcommit amount can be set via `vm.overcommit_ratio' (percentage) +or `vm.overcommit_kbytes' (absolute value). The current overcommit limit and amount committed are viewable in /proc/meminfo as CommitLimit and Committed_AS respectively. diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h index 58b8c6a..9908443 100644 --- a/arch/arm/include/asm/dma.h +++ b/arch/arm/include/asm/dma.h @@ -8,8 +8,8 @@ #define MAX_DMA_ADDRESS 0xffffffffUL #else #define MAX_DMA_ADDRESS ({ \ - extern unsigned long arm_dma_zone_size; \ - arm_dma_zone_size ? \ + extern phys_addr_t arm_dma_zone_size; \ + arm_dma_zone_size && arm_dma_zone_size < (0x10000000 - PAGE_OFFSET) ? \ (PAGE_OFFSET + arm_dma_zone_size) : 0xffffffffUL; }) #endif diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c index 34d5fd5..f751714 100644 --- a/arch/arm/kernel/devtree.c +++ b/arch/arm/kernel/devtree.c @@ -33,7 +33,7 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) { - return alloc_bootmem_align(size, align); + return memblock_virt_alloc(size, align); } void __init arm_dt_memblock_reserve(void) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 987a7f5..8ce1cbd 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -717,7 +717,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) kernel_data.end = virt_to_phys(_end - 1); for_each_memblock(memory, region) { - res = alloc_bootmem_low(sizeof(*res)); + res = memblock_virt_alloc(sizeof(*res), 0); res->name = "System RAM"; res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index 8a1b5e0..f7a6fd3 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -2791,9 +2791,7 @@ static int __init _alloc_links(struct omap_hwmod_link **ml, sz = sizeof(struct omap_hwmod_link) * LINKS_PER_OCP_IF; *sl = NULL; - *ml = alloc_bootmem(sz); - - memset(*ml, 0, sz); + *ml = memblock_virt_alloc(sz, 0); *sl = (void *)(*ml) + sizeof(struct omap_hwmod_link); @@ -2912,9 +2910,7 @@ static int __init _alloc_linkspace(struct omap_hwmod_ocp_if **ois) pr_debug("omap_hwmod: %s: allocating %d byte linkspace (%d links)\n", __func__, sz, max_ls); - linkspace = alloc_bootmem(sz); - - memset(linkspace, 0, sz); + linkspace = memblock_virt_alloc(sz, 0); return 0; } diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 3e8f106e..11eb8ad 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -92,9 +92,6 @@ void show_mem(unsigned int filter) printk("Mem-info:\n"); show_free_areas(filter); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; - for_each_bank (i, mi) { struct membank *bank = &mi->bank[i]; unsigned int pfn1, pfn2; @@ -461,7 +458,7 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn) * free the section of the memmap array. */ if (pg < pgend) - free_bootmem(pg, pgend - pg); + memblock_free_early(pg, pgend - pg); } /* diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index da5237d..52715a7 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -31,74 +31,6 @@ static unsigned long max_gap; #endif -/** - * show_mem - give short summary of memory stats - * - * Shows a simple page count of reserved and used pages in the system. - * For discontig machines, it does this on a per-pgdat basis. - */ -void show_mem(unsigned int filter) -{ - int i, total_reserved = 0; - int total_shared = 0, total_cached = 0; - unsigned long total_present = 0; - pg_data_t *pgdat; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(filter); - printk(KERN_INFO "Node memory in pages:\n"); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; - for_each_online_pgdat(pgdat) { - unsigned long present; - unsigned long flags; - int shared = 0, cached = 0, reserved = 0; - int nid = pgdat->node_id; - - if (skip_free_areas_node(filter, nid)) - continue; - pgdat_resize_lock(pgdat, &flags); - present = pgdat->node_present_pages; - for(i = 0; i < pgdat->node_spanned_pages; i++) { - struct page *page; - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) - touch_nmi_watchdog(); - if (pfn_valid(pgdat->node_start_pfn + i)) - page = pfn_to_page(pgdat->node_start_pfn + i); - else { -#ifdef CONFIG_VIRTUAL_MEM_MAP - if (max_gap < LARGE_GAP) - continue; -#endif - i = vmemmap_find_next_valid_pfn(nid, i) - 1; - continue; - } - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page)-1; - } - pgdat_resize_unlock(pgdat, &flags); - total_present += present; - total_reserved += reserved; - total_cached += cached; - total_shared += shared; - printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, " - "shrd: %10d, swpd: %10d\n", nid, - present, reserved, shared, cached); - } - printk(KERN_INFO "%ld pages of RAM\n", total_present); - printk(KERN_INFO "%d reserved pages\n", total_reserved); - printk(KERN_INFO "%d pages shared\n", total_shared); - printk(KERN_INFO "%d pages swap cached\n", total_cached); - printk(KERN_INFO "Total of %ld pages in page table cache\n", - quicklist_total_size()); - printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); -} - - /* physical address where the bootmem map is located */ unsigned long bootmap_start; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 2de08f4..8786268 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -608,69 +608,6 @@ void *per_cpu_init(void) #endif /* CONFIG_SMP */ /** - * show_mem - give short summary of memory stats - * - * Shows a simple page count of reserved and used pages in the system. - * For discontig machines, it does this on a per-pgdat basis. - */ -void show_mem(unsigned int filter) -{ - int i, total_reserved = 0; - int total_shared = 0, total_cached = 0; - unsigned long total_present = 0; - pg_data_t *pgdat; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(filter); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; - printk(KERN_INFO "Node memory in pages:\n"); - for_each_online_pgdat(pgdat) { - unsigned long present; - unsigned long flags; - int shared = 0, cached = 0, reserved = 0; - int nid = pgdat->node_id; - - if (skip_free_areas_node(filter, nid)) - continue; - pgdat_resize_lock(pgdat, &flags); - present = pgdat->node_present_pages; - for(i = 0; i < pgdat->node_spanned_pages; i++) { - struct page *page; - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) - touch_nmi_watchdog(); - if (pfn_valid(pgdat->node_start_pfn + i)) - page = pfn_to_page(pgdat->node_start_pfn + i); - else { - i = vmemmap_find_next_valid_pfn(nid, i) - 1; - continue; - } - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page)-1; - } - pgdat_resize_unlock(pgdat, &flags); - total_present += present; - total_reserved += reserved; - total_cached += cached; - total_shared += shared; - printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, " - "shrd: %10d, swpd: %10d\n", nid, - present, reserved, shared, cached); - } - printk(KERN_INFO "%ld pages of RAM\n", total_present); - printk(KERN_INFO "%d reserved pages\n", total_reserved); - printk(KERN_INFO "%d pages shared\n", total_shared); - printk(KERN_INFO "%d pages swap cached\n", total_cached); - printk(KERN_INFO "Total of %ld pages in page table cache\n", - quicklist_total_size()); - printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); -} - -/** * call_pernode_memory - use SRAT to call callback functions with node info * @start: physical start of range * @len: length of range diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 88504ab..25c3502 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -684,3 +684,51 @@ per_linux32_init(void) } __initcall(per_linux32_init); + +/** + * show_mem - give short summary of memory stats + * + * Shows a simple page count of reserved and used pages in the system. + * For discontig machines, it does this on a per-pgdat basis. + */ +void show_mem(unsigned int filter) +{ + int total_reserved = 0; + unsigned long total_present = 0; + pg_data_t *pgdat; + + printk(KERN_INFO "Mem-info:\n"); + show_free_areas(filter); + printk(KERN_INFO "Node memory in pages:\n"); + for_each_online_pgdat(pgdat) { + unsigned long present; + unsigned long flags; + int reserved = 0; + int nid = pgdat->node_id; + int zoneid; + + if (skip_free_areas_node(filter, nid)) + continue; + pgdat_resize_lock(pgdat, &flags); + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + reserved += zone->present_pages - zone->managed_pages; + } + present = pgdat->node_present_pages; + + pgdat_resize_unlock(pgdat, &flags); + total_present += present; + total_reserved += reserved; + printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, ", + nid, present, reserved); + } + printk(KERN_INFO "%ld pages of RAM\n", total_present); + printk(KERN_INFO "%d reserved pages\n", total_reserved); + printk(KERN_INFO "Total of %ld pages in page table cache\n", + quicklist_total_size()); + printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); +} diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c index 3cd6288f..11fa51c 100644 --- a/arch/metag/mm/init.c +++ b/arch/metag/mm/init.c @@ -204,7 +204,8 @@ static void __init do_init_bootmem(void) start_pfn = memblock_region_memory_base_pfn(reg); end_pfn = memblock_region_memory_end_pfn(reg); memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), 0); + PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, 0); } /* All of system RAM sits in node 0 for the non-NUMA case */ diff --git a/arch/metag/mm/numa.c b/arch/metag/mm/numa.c index b172aa4..67b46c2 100644 --- a/arch/metag/mm/numa.c +++ b/arch/metag/mm/numa.c @@ -42,7 +42,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end) memblock_add(start, end - start); memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), nid); + PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, nid); /* Node-local pgdat */ pgdat_paddr = memblock_alloc_base(sizeof(struct pglist_data), diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 74c7bcc..89077d3 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -192,7 +192,8 @@ void __init setup_memory(void) start_pfn = memblock_region_memory_base_pfn(reg); end_pfn = memblock_region_memory_end_pfn(reg); memblock_set_node(start_pfn << PAGE_SHIFT, - (end_pfn - start_pfn) << PAGE_SHIFT, 0); + (end_pfn - start_pfn) << PAGE_SHIFT, + &memblock.memory, 0); } /* free bootmem is whole main memory */ diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 96f8168..ae085ad 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -645,55 +645,30 @@ EXPORT_SYMBOL(empty_zero_page); void show_mem(unsigned int filter) { - int i,free = 0,total = 0,reserved = 0; - int shared = 0, cached = 0; + int total = 0,reserved = 0; + pg_data_t *pgdat; printk(KERN_INFO "Mem-info:\n"); show_free_areas(filter); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; -#ifndef CONFIG_DISCONTIGMEM - i = max_mapnr; - while (i-- > 0) { - total++; - if (PageReserved(mem_map+i)) - reserved++; - else if (PageSwapCache(mem_map+i)) - cached++; - else if (!page_count(&mem_map[i])) - free++; - else - shared += page_count(&mem_map[i]) - 1; - } -#else - for (i = 0; i < npmem_ranges; i++) { - int j; - for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { - struct page *p; - unsigned long flags; - - pgdat_resize_lock(NODE_DATA(i), &flags); - p = nid_page_nr(i, j) - node_start_pfn(i); - - total++; - if (PageReserved(p)) - reserved++; - else if (PageSwapCache(p)) - cached++; - else if (!page_count(p)) - free++; - else - shared += page_count(p) - 1; - pgdat_resize_unlock(NODE_DATA(i), &flags); - } + for_each_online_pgdat(pgdat) { + unsigned long flags; + int zoneid; + + pgdat_resize_lock(pgdat, &flags); + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + total += zone->present_pages; + reserved = zone->present_pages - zone->managed_pages; + } + pgdat_resize_unlock(pgdat, &flags); } -#endif + printk(KERN_INFO "%d pages of RAM\n", total); printk(KERN_INFO "%d reserved pages\n", reserved); - printk(KERN_INFO "%d pages shared\n", shared); - printk(KERN_INFO "%d pages swap cached\n", cached); - #ifdef CONFIG_DISCONTIGMEM { diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 3fa93dc..8c1dd23 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -209,7 +209,7 @@ void __init do_init_bootmem(void) /* Place all memblock_regions in the same node and merge contiguous * memblock_regions */ - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); /* Add all physical memory to the bootmem map, mark each area * present. diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 078d3e0..5a944f2 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -670,7 +670,8 @@ static void __init parse_drconf_memory(struct device_node *memory) node_set_online(nid); sz = numa_enforce_memory_limit(base, size); if (sz) - memblock_set_node(base, sz, nid); + memblock_set_node(base, sz, + &memblock.memory, nid); } while (--ranges); } } @@ -760,7 +761,7 @@ new_range: continue; } - memblock_set_node(start, size, nid); + memblock_set_node(start, size, &memblock.memory, nid); if (--ranges) goto new_range; @@ -797,7 +798,8 @@ static void __init setup_nonnuma(void) fake_numa_create_new_node(end_pfn, &nid); memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), nid); + PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, nid); node_set_online(nid); } } diff --git a/arch/score/Kconfig b/arch/score/Kconfig index 305f7ee..c75d06a 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -2,7 +2,6 @@ menu "Machine selection" config SCORE def_bool y - select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select GENERIC_IOMAP select GENERIC_ATOMIC64 diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c index 38b3139..adad46e 100644 --- a/arch/sh/kernel/kgdb.c +++ b/arch/sh/kernel/kgdb.c @@ -13,6 +13,7 @@ #include <linux/kdebug.h> #include <linux/irq.h> #include <linux/io.h> +#include <linux/sched.h> #include <asm/cacheflush.h> #include <asm/traps.h> diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index 1cf90e9..de19cfa 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -230,8 +230,8 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn, pmb_bolt_mapping((unsigned long)__va(start), start, end - start, PAGE_KERNEL); - memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), nid); + memblock_set_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, nid); } void __init __weak plat_early_device_setup(void) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 5322e53..eafbc65 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1021,7 +1021,8 @@ static void __init add_node_ranges(void) "start[%lx] end[%lx]\n", nid, start, this_end); - memblock_set_node(start, this_end - start, nid); + memblock_set_node(start, this_end - start, + &memblock.memory, nid); start = this_end; } } @@ -1325,7 +1326,7 @@ static void __init bootmem_init_nonnuma(void) (top_of_ram - total_ram) >> 20); init_node_masks_nonnuma(); - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); allocate_node_data(0); node_set_online(0); } diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index ae6bc03..be2bde9 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -66,9 +66,6 @@ void show_mem(unsigned int filter) printk(KERN_DEFAULT "Mem-info:\n"); show_free_areas(filter); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; - for_each_bank(i, mi) { struct membank *bank = &mi->bank[i]; unsigned int pfn1, pfn2; diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index f97fbe3..2f59cce 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,9 +51,9 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; -static inline phys_addr_t get_max_mapped(void) +static inline phys_addr_t get_max_low_mapped(void) { - return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; + return (phys_addr_t)max_low_pfn_mapped << PAGE_SHIFT; } bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index e2dbcb7..83a7995 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void) corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) { start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), PAGE_SIZE, corruption_check_size); end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 174da5f..988c00a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void) nr_pages += end_pfn - start_pfn; } - for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { + for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) { start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); if (start_pfn < end_pfn) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 06853e6..c967559 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); - memblock_set_current_limit(get_max_mapped()); + memblock_set_current_limit(get_max_low_mapped()); dma_contiguous_reserve(0); /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5bdc543..e395048 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -665,7 +665,7 @@ void __init initmem_init(void) high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); sparse_memory_present_with_active_regions(0); #ifdef CONFIG_FLATMEM diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 104d56a..f35c66c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -643,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start, #ifndef CONFIG_NUMA void __init initmem_init(void) { - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); } #endif diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 8dabbed..1e9da79 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -74,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end) u64 i; phys_addr_t this_start, this_end; - for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { + for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { this_start = clamp_t(phys_addr_t, this_start, start, end); this_end = clamp_t(phys_addr_t, this_end, start, end); if (this_start < this_end) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c85da7b..81b2750 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -491,7 +491,16 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *mb = &mi->blk[i]; - memblock_set_node(mb->start, mb->end - mb->start, mb->nid); + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.memory, mb->nid); + + /* + * At this time, all memory regions reserved by memblock are + * used by the kernel. Set the nid in memblock.reserved will + * mark out all the nodes the kernel resides in. + */ + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.reserved, mb->nid); } /* @@ -553,6 +562,30 @@ static void __init numa_init_array(void) } } +static void __init numa_clear_kernel_node_hotplug(void) +{ + int i, nid; + nodemask_t numa_kernel_nodes; + unsigned long start, end; + struct memblock_type *type = &memblock.reserved; + + /* Mark all kernel nodes. */ + for (i = 0; i < type->cnt; i++) + node_set(type->regions[i].nid, numa_kernel_nodes); + + /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + nid = numa_meminfo.blk[i].nid; + if (!node_isset(nid, numa_kernel_nodes)) + continue; + + start = numa_meminfo.blk[i].start; + end = numa_meminfo.blk[i].end; + + memblock_clear_hotplug(start, end - start); + } +} + static int __init numa_init(int (*init_func)(void)) { int i; @@ -565,7 +598,12 @@ static int __init numa_init(int (*init_func)(void)) nodes_clear(node_possible_map); nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); + WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, + MAX_NUMNODES)); + WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, + MAX_NUMNODES)); + /* In case that parsing SRAT failed. */ + WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); numa_reset_distance(); ret = init_func(); @@ -601,6 +639,16 @@ static int __init numa_init(int (*init_func)(void)) numa_clear_node(i); } numa_init_array(); + + /* + * At very early time, the kernel have to use some memory such as + * loading the kernel image. We cannot prevent this anyway. So any + * node the kernel resides in should be un-hotpluggable. + * + * And when we come here, numa_init() won't fail. + */ + numa_clear_kernel_node_hotplug(); + return 0; } diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 266ca91..1a25187 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -181,6 +181,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) (unsigned long long) start, (unsigned long long) end - 1, hotpluggable ? " hotplug" : ""); + /* Mark hotplug range in memblock. */ + if (hotpluggable && memblock_mark_hotplug(start, ma->length)) + pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", + (unsigned long long)start, (unsigned long long)end - 1); + return 0; out_err_bad_srat: bad_srat(); diff --git a/drivers/char/mem.c b/drivers/char/mem.c index f895a8c..92c5937 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -22,7 +22,6 @@ #include <linux/device.h> #include <linux/highmem.h> #include <linux/backing-dev.h> -#include <linux/bootmem.h> #include <linux/splice.h> #include <linux/pfn.h> #include <linux/export.h> diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index e2e04b0..17cf96c 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -324,7 +324,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; - entry = alloc_bootmem(sizeof(struct firmware_map_entry)); + entry = memblock_virt_alloc(sizeof(struct firmware_map_entry), 0); if (WARN_ON(!entry)) return -ENOMEM; diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 43b9bfe..59779e1 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -917,7 +917,7 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level, /* If range covers entire pagetable, free it */ if (!(start_pfn > level_pfn || - last_pfn < level_pfn + level_size(level))) { + last_pfn < level_pfn + level_size(level) - 1)) { dma_clear_pte(pte); domain_flush_cache(domain, pte, sizeof(*pte)); free_pgtable_page(level_pte); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index dc52e13..3881610 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, struct i2c_msg __user *tmsgs; struct i2c_msg32 __user *umsgs; compat_caddr_t datap; - int nmsgs, i; + u32 nmsgs; + int i; if (get_user(nmsgs, &udata->nmsgs)) return -EFAULT; diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 1fedd5f..0b9ff43 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) * events. */ static int dnotify_handle_event(struct fsnotify_group *group, + struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *file_name) { struct dnotify_mark *dn_mark; - struct inode *to_tell; struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; - __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; + __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; - BUG_ON(vfsmount_mark); + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return 0; - to_tell = event->to_tell; + BUG_ON(vfsmount_mark); dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); @@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group, return 0; } -/* - * Given an inode and mask determine if dnotify would be interested in sending - * userspace notification for that pair. - */ -static bool dnotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - /* not a dir, dnotify doesn't care */ - if (!S_ISDIR(inode->i_mode)) - return false; - - return true; -} - static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) { struct dnotify_mark *dn_mark = container_of(fsn_mark, @@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, - .should_send_event = dnotify_should_send_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event_priv = NULL, }; /* diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 0c2f912..5877262 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -9,31 +9,27 @@ #include <linux/types.h> #include <linux/wait.h> -static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) +#include "fanotify.h" + +static bool should_merge(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - pr_debug("%s: old=%p new=%p\n", __func__, old, new); + struct fanotify_event_info *old, *new; - if (old->to_tell == new->to_tell && - old->data_type == new->data_type && - old->tgid == new->tgid) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_PATH): #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - /* dont merge two permission events */ - if ((old->mask & FAN_ALL_PERM_EVENTS) && - (new->mask & FAN_ALL_PERM_EVENTS)) - return false; + /* dont merge two permission events */ + if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) && + (new_fsn->mask & FAN_ALL_PERM_EVENTS)) + return false; #endif - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - return true; - default: - BUG(); - }; - } + pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); + old = FANOTIFY_E(old_fsn); + new = FANOTIFY_E(new_fsn); + + if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && + old->path.mnt == new->path.mnt && + old->path.dentry == new->path.dentry) + return true; return false; } @@ -41,59 +37,28 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) static struct fsnotify_event *fanotify_merge(struct list_head *list, struct fsnotify_event *event) { - struct fsnotify_event_holder *test_holder; - struct fsnotify_event *test_event = NULL; - struct fsnotify_event *new_event; + struct fsnotify_event *test_event; + bool do_merge = false; pr_debug("%s: list=%p event=%p\n", __func__, list, event); - - list_for_each_entry_reverse(test_holder, list, event_list) { - if (should_merge(test_holder->event, event)) { - test_event = test_holder->event; + list_for_each_entry_reverse(test_event, list, list) { + if (should_merge(test_event, event)) { + do_merge = true; break; } } - if (!test_event) + if (!do_merge) return NULL; - fsnotify_get_event(test_event); - - /* if they are exactly the same we are done */ - if (test_event->mask == event->mask) - return test_event; - - /* - * if the refcnt == 2 this is the only queue - * for this event and so we can update the mask - * in place. - */ - if (atomic_read(&test_event->refcnt) == 2) { - test_event->mask |= event->mask; - return test_event; - } - - new_event = fsnotify_clone_event(test_event); - - /* done with test_event */ - fsnotify_put_event(test_event); - - /* couldn't allocate memory, merge was not possible */ - if (unlikely(!new_event)) - return ERR_PTR(-ENOMEM); - - /* build new event and replace it on the list */ - new_event->mask = (test_event->mask | event->mask); - fsnotify_replace_event(test_holder, new_event); - - /* we hold a reference on new_event from clone_event */ - return new_event; + test_event->mask |= event->mask; + return test_event; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static int fanotify_get_response_from_access(struct fsnotify_group *group, - struct fsnotify_event *event) + struct fanotify_event_info *event) { int ret; @@ -106,7 +71,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, return 0; /* userspace responded, convert to something usable */ - spin_lock(&event->lock); switch (event->response) { case FAN_ALLOW: ret = 0; @@ -116,7 +80,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, ret = -EPERM; } event->response = 0; - spin_unlock(&event->lock); pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); @@ -125,58 +88,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, } #endif -static int fanotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *fanotify_mark, - struct fsnotify_event *event) -{ - int ret = 0; - struct fsnotify_event *notify_event = NULL; - - BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); - BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); - BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); - BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); - BUILD_BUG_ON(FAN_OPEN != FS_OPEN); - BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); - BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); - BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); - BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); - BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); - - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - - notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge); - if (IS_ERR(notify_event)) - return PTR_ERR(notify_event); - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) { - /* if we merged we need to wait on the new event */ - if (notify_event) - event = notify_event; - ret = fanotify_get_response_from_access(group, event); - } -#endif - - if (notify_event) - fsnotify_put_event(notify_event); - - return ret; -} - -static bool fanotify_should_send_event(struct fsnotify_group *group, - struct inode *to_tell, - struct fsnotify_mark *inode_mark, +static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, - __u32 event_mask, void *data, int data_type) + u32 event_mask, + void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; struct path *path = data; - pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " - "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, - inode_mark, vfsmnt_mark, event_mask, data, data_type); + pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" + " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, + event_mask, data, data_type); /* if we don't have enough info to send an event to userspace say no */ if (data_type != FSNOTIFY_EVENT_PATH) @@ -217,6 +139,74 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, return false; } +static int fanotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *fanotify_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) +{ + int ret = 0; + struct fanotify_event_info *event; + struct fsnotify_event *fsn_event; + struct fsnotify_event *notify_fsn_event; + + BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); + BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); + BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + + if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data, + data_type)) + return 0; + + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); + + event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + if (unlikely(!event)) + return -ENOMEM; + + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->tgid = get_pid(task_tgid(current)); + if (data_type == FSNOTIFY_EVENT_PATH) { + struct path *path = data; + event->path = *path; + path_get(&event->path); + } else { + event->path.mnt = NULL; + event->path.dentry = NULL; + } +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + event->response = 0; +#endif + + notify_fsn_event = fsnotify_add_notify_event(group, fsn_event, + fanotify_merge); + if (notify_fsn_event) { + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); + if (IS_ERR(notify_fsn_event)) + return PTR_ERR(notify_fsn_event); + /* We need to ask about a different events after a merge... */ + event = FANOTIFY_E(notify_fsn_event); + fsn_event = notify_fsn_event; + } + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (fsn_event->mask & FAN_ALL_PERM_EVENTS) + ret = fanotify_get_response_from_access(group, event); +#endif + return ret; +} + static void fanotify_free_group_priv(struct fsnotify_group *group) { struct user_struct *user; @@ -226,10 +216,18 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) free_uid(user); } +static void fanotify_free_event(struct fsnotify_event *fsn_event) +{ + struct fanotify_event_info *event; + + event = FANOTIFY_E(fsn_event); + path_put(&event->path); + put_pid(event->tgid); + kmem_cache_free(fanotify_event_cachep, event); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, - .should_send_event = fanotify_should_send_event, .free_group_priv = fanotify_free_group_priv, - .free_event_priv = NULL, - .freeing_mark = NULL, + .free_event = fanotify_free_event, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h new file mode 100644 index 0000000..0e90174 --- /dev/null +++ b/fs/notify/fanotify/fanotify.h @@ -0,0 +1,23 @@ +#include <linux/fsnotify_backend.h> +#include <linux/path.h> +#include <linux/slab.h> + +extern struct kmem_cache *fanotify_event_cachep; + +struct fanotify_event_info { + struct fsnotify_event fse; + /* + * We hold ref to this path so it may be dereferenced at any point + * during this object's lifetime + */ + struct path path; + struct pid *tgid; +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + u32 response; /* userspace answer to question */ +#endif +}; + +static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct fanotify_event_info, fse); +} diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index e44cb64..57d7c08 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -19,6 +19,7 @@ #include "../../mount.h" #include "../fdinfo.h" +#include "fanotify.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 @@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; static struct kmem_cache *fanotify_response_event_cache __read_mostly; +struct kmem_cache *fanotify_event_cachep __read_mostly; struct fanotify_response_event { struct list_head list; __s32 fd; - struct fsnotify_event *event; + struct fanotify_event_info *event; }; /* @@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, } static int create_fd(struct fsnotify_group *group, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_info *event, + struct file **file) { int client_fd; struct file *new_file; @@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group, if (client_fd < 0) return client_fd; - if (event->data_type != FSNOTIFY_EVENT_PATH) { - WARN_ON(1); - put_unused_fd(client_fd); - return -EINVAL; - } - /* * we need a new file handle for the userspace program so it can read even if it was * originally opened O_WRONLY. @@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group, } static int fill_event_metadata(struct fsnotify_group *group, - struct fanotify_event_metadata *metadata, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_metadata *metadata, + struct fsnotify_event *fsn_event, + struct file **file) { int ret = 0; + struct fanotify_event_info *event; pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, - group, metadata, event); + group, metadata, fsn_event); *file = NULL; + event = container_of(fsn_event, struct fanotify_event_info, fse); metadata->event_len = FAN_EVENT_METADATA_LEN; metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->reserved = 0; - metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; + metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); - if (unlikely(event->mask & FAN_Q_OVERFLOW)) + if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) metadata->fd = FAN_NOFD; else { metadata->fd = create_fd(group, event, file); @@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, if (!re) return -ENOMEM; - re->event = event; + re->event = FANOTIFY_E(event); re->fd = fd; mutex_lock(&group->fanotify_data.access_mutex); @@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, if (atomic_read(&group->fanotify_data.bypass_perm)) { mutex_unlock(&group->fanotify_data.access_mutex); kmem_cache_free(fanotify_response_event_cache, re); - event->response = FAN_ALLOW; + FANOTIFY_E(event)->response = FAN_ALLOW; return 0; } @@ -273,7 +271,7 @@ out_close_fd: out: #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (event->mask & FAN_ALL_PERM_EVENTS) { - event->response = FAN_DENY; + FANOTIFY_E(event)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); } #endif @@ -321,7 +319,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -409,7 +407,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -421,7 +419,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) + list_for_each_entry(fsn_event, &group->notification_list, list) send_len += FAN_EVENT_METADATA_LEN; mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -906,6 +904,7 @@ static int __init fanotify_user_setup(void) fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, SLAB_PANIC); + fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); return 0; } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4bb21d6..1d4e1ea 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_mark *vfsmount_mark, __u32 mask, void *data, int data_is, u32 cookie, - const unsigned char *file_name, - struct fsnotify_event **event) + const unsigned char *file_name) { struct fsnotify_group *group = NULL; __u32 inode_test_mask = 0; @@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell, pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" - " data=%p data_is=%d cookie=%d event=%p\n", + " data=%p data_is=%d cookie=%d\n", __func__, group, to_tell, mask, inode_mark, inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, - data_is, cookie, *event); + data_is, cookie); if (!inode_test_mask && !vfsmount_test_mask) return 0; - if (group->ops->should_send_event(group, to_tell, inode_mark, - vfsmount_mark, mask, data, - data_is) == false) - return 0; - - if (!*event) { - *event = fsnotify_create_event(to_tell, mask, data, - data_is, file_name, - cookie, GFP_KERNEL); - if (!*event) - return -ENOMEM; - } - return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event); + return group->ops->handle_event(group, to_tell, inode_mark, + vfsmount_mark, mask, data, data_is, + file_name); } /* @@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; - struct fsnotify_event *event = NULL; struct mount *mnt; int idx, ret = 0; /* global tests shouldn't care about events on child only the specific event */ @@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, if (inode_group > vfsmount_group) { /* handle inode */ - ret = send_to_group(to_tell, inode_mark, NULL, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, inode_mark, NULL, mask, + data, data_is, cookie, file_name); /* we didn't use the vfsmount_mark */ vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { - ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, + data, data_is, cookie, file_name); inode_group = NULL; } else { ret = send_to_group(to_tell, inode_mark, vfsmount_mark, - mask, data, data_is, cookie, file_name, - &event); + mask, data, data_is, cookie, + file_name); } if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) @@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, idx); - /* - * fsnotify_create_event() took a reference so the event can't be cleaned - * up while we are still trying to add it to lists, drop that one. - */ - if (event) - fsnotify_put_event(event); return ret; } diff --git a/fs/notify/group.c b/fs/notify/group.c index bd2625b..ee674fe 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -99,6 +99,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) INIT_LIST_HEAD(&group->marks_list); group->ops = ops; + fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW); return group; } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index b6642e4..485eef3 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -2,11 +2,12 @@ #include <linux/inotify.h> #include <linux/slab.h> /* struct kmem_cache */ -extern struct kmem_cache *event_priv_cachep; - -struct inotify_event_private_data { - struct fsnotify_event_private_data fsnotify_event_priv_data; +struct inotify_event_info { + struct fsnotify_event fse; int wd; + u32 sync_cookie; + int name_len; + char name[]; }; struct inotify_inode_mark { @@ -14,8 +15,18 @@ struct inotify_inode_mark { int wd; }; +static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct inotify_event_info, fse); +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); -extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); +extern int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 4216308..aad1a35 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -34,100 +34,87 @@ #include "inotify.h" /* - * Check if 2 events contain the same information. We do not compare private data - * but at this moment that isn't a problem for any know fsnotify listeners. + * Check if 2 events contain the same information. */ -static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) +static bool event_compare(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - if ((old->mask == new->mask) && - (old->to_tell == new->to_tell) && - (old->data_type == new->data_type) && - (old->name_len == new->name_len)) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_INODE): - /* remember, after old was put on the wait_q we aren't - * allowed to look at the inode any more, only thing - * left to check was if the file_name is the same */ - if (!old->name_len || - !strcmp(old->file_name, new->file_name)) - return true; - break; - case (FSNOTIFY_EVENT_PATH): - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - if (old->mask & FS_Q_OVERFLOW) - return true; - else if (old->mask & FS_IN_IGNORED) - return false; - return true; - }; - } + struct inotify_event_info *old, *new; + + if (old_fsn->mask & FS_IN_IGNORED) + return false; + old = INOTIFY_E(old_fsn); + new = INOTIFY_E(new_fsn); + if ((old_fsn->mask == new_fsn->mask) && + (old_fsn->inode == new_fsn->inode) && + (old->name_len == new->name_len) && + (!old->name_len || !strcmp(old->name, new->name))) + return true; return false; } static struct fsnotify_event *inotify_merge(struct list_head *list, struct fsnotify_event *event) { - struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; - /* and the list better be locked by something too */ - spin_lock(&event->lock); - - last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); - last_event = last_holder->event; - if (event_compare(last_event, event)) - fsnotify_get_event(last_event); - else - last_event = NULL; - - spin_unlock(&event->lock); - + last_event = list_entry(list->prev, struct fsnotify_event, list); + if (!event_compare(last_event, event)) + return NULL; return last_event; } -static int inotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) +int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) { struct inotify_inode_mark *i_mark; - struct inode *to_tell; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; + struct inotify_event_info *event; struct fsnotify_event *added_event; - int wd, ret = 0; + struct fsnotify_event *fsn_event; + int ret = 0; + int len = 0; + int alloc_len = sizeof(struct inotify_event_info); BUG_ON(vfsmount_mark); - pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, - event, event->to_tell, event->mask); + if ((inode_mark->mask & FS_EXCL_UNLINK) && + (data_type == FSNOTIFY_EVENT_PATH)) { + struct path *path = data; - to_tell = event->to_tell; + if (d_unlinked(path->dentry)) + return 0; + } + if (file_name) { + len = strlen(file_name); + alloc_len += len + 1; + } + + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - wd = i_mark->wd; - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); - if (unlikely(!event_priv)) + event = kmalloc(alloc_len, GFP_KERNEL); + if (unlikely(!event)) return -ENOMEM; - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = wd; + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->wd = i_mark->wd; + event->name_len = len; + if (len) + strcpy(event->name, file_name); - added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); + added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge); if (added_event) { - inotify_free_event_priv(fsn_event_priv); - if (!IS_ERR(added_event)) - fsnotify_put_event(added_event); - else + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); + if (IS_ERR(added_event)) ret = PTR_ERR(added_event); } @@ -142,22 +129,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify inotify_ignored_and_remove_idr(fsn_mark, group); } -static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - if ((inode_mark->mask & FS_EXCL_UNLINK) && - (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; - - if (d_unlinked(path->dentry)) - return false; - } - - return true; -} - /* * This is NEVER supposed to be called. Inotify marks should either have been * removed from the idr when the watch was removed or in the @@ -202,22 +173,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group) free_uid(group->inotify_data.user); } -void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) +static void inotify_free_event(struct fsnotify_event *fsn_event) { - struct inotify_event_private_data *event_priv; - - - event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - - fsnotify_put_group(fsn_event_priv->group); - kmem_cache_free(event_priv_cachep, event_priv); + kfree(INOTIFY_E(fsn_event)); } const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, - .should_send_event = inotify_should_send_event, .free_group_priv = inotify_free_group_priv, - .free_event_priv = inotify_free_event_priv, + .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, }; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 60f954a..497395c 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly; static int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; -struct kmem_cache *event_priv_cachep __read_mostly; #ifdef CONFIG_SYSCTL @@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) return ret; } +static int round_event_name_len(struct fsnotify_event *fsn_event) +{ + struct inotify_event_info *event; + + event = INOTIFY_E(fsn_event); + if (!event->name_len) + return 0; + return roundup(event->name_len + 1, sizeof(struct inotify_event)); +} + /* * Get an inotify_kernel_event if one exists and is small * enough to fit in "count". Return an error pointer if @@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - if (event->name_len) - event_size += roundup(event->name_len + 1, event_size); - + event_size += round_event_name_len(event); if (event_size > count) return ERR_PTR(-EINVAL); @@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, * buffer we had in "get_one_event()" above. */ static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *event, + struct fsnotify_event *fsn_event, char __user *buf) { struct inotify_event inotify_event; - struct fsnotify_event_private_data *fsn_priv; - struct inotify_event_private_data *priv; + struct inotify_event_info *event; size_t event_size = sizeof(struct inotify_event); - size_t name_len = 0; - - pr_debug("%s: group=%p event=%p\n", __func__, group, event); + size_t name_len; + size_t pad_name_len; - /* we get the inotify watch descriptor from the event private data */ - spin_lock(&event->lock); - fsn_priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - - if (!fsn_priv) - inotify_event.wd = -1; - else { - priv = container_of(fsn_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - inotify_event.wd = priv->wd; - inotify_free_event_priv(fsn_priv); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); + event = INOTIFY_E(fsn_event); + name_len = event->name_len; /* - * round up event->name_len so it is a multiple of event_size + * round up name length so it is a multiple of event_size * plus an extra byte for the terminating '\0'. */ - if (event->name_len) - name_len = roundup(event->name_len + 1, event_size); - inotify_event.len = name_len; - - inotify_event.mask = inotify_mask_to_arg(event->mask); + pad_name_len = round_event_name_len(fsn_event); + inotify_event.len = pad_name_len; + inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); + inotify_event.wd = event->wd; inotify_event.cookie = event->sync_cookie; /* send the main event */ @@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, /* * fsnotify only stores the pathname, so here we have to send the pathname * and then pad that pathname out to a multiple of sizeof(inotify_event) - * with zeros. I get my zeros from the nul_inotify_event. + * with zeros. */ - if (name_len) { - unsigned int len_to_zero = name_len - event->name_len; + if (pad_name_len) { /* copy the path name */ - if (copy_to_user(buf, event->file_name, event->name_len)) + if (copy_to_user(buf, event->name, name_len)) return -EFAULT; - buf += event->name_len; + buf += name_len; /* fill userspace with 0's */ - if (clear_user(buf, len_to_zero)) + if (clear_user(buf, pad_name_len - name_len)) return -EFAULT; - buf += len_to_zero; - event_size += name_len; + event_size += pad_name_len; } return event_size; @@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; - struct fsnotify_event *event; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) { - event = holder->event; + list_for_each_entry(fsn_event, &group->notification_list, + list) { send_len += sizeof(struct inotify_event); - if (event->name_len) - send_len += roundup(event->name_len + 1, - sizeof(struct inotify_event)); + send_len += round_event_name_len(fsn_event); } mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { struct inotify_inode_mark *i_mark; - struct fsnotify_event *ignored_event, *notify_event; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - int ret; - - i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); - - ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_NOFS); - if (!ignored_event) - goto skip_send_ignore; - - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); - if (unlikely(!event_priv)) - goto skip_send_ignore; - - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = i_mark->wd; - - notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL); - if (notify_event) { - if (IS_ERR(notify_event)) - ret = PTR_ERR(notify_event); - else - fsnotify_put_event(notify_event); - inotify_free_event_priv(fsn_event_priv); - } -skip_send_ignore: - /* matches the reference taken when the event was created */ - if (ignored_event) - fsnotify_put_event(ignored_event); + /* Queue ignore event for the watch */ + inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, + NULL, FSNOTIFY_EVENT_NONE, NULL); + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); @@ -836,7 +794,6 @@ static int __init inotify_user_setup(void) BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); - event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); inotify_max_queued_events = 16384; inotify_max_user_instances = 128; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 7b51b05..952237b 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -48,15 +48,6 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -static struct kmem_cache *fsnotify_event_cachep; -static struct kmem_cache *fsnotify_event_holder_cachep; -/* - * This is a magic event we send when the q is too full. Since it doesn't - * hold real event information we just keep one system wide and use it any time - * it is needed. It's refcnt is set 1 at kernel init time and will never - * get set to 0 so it will never get 'freed' - */ -static struct fsnotify_event *q_overflow_event; static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); /** @@ -76,60 +67,14 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) return list_empty(&group->notification_list) ? true : false; } -void fsnotify_get_event(struct fsnotify_event *event) +void fsnotify_destroy_event(struct fsnotify_group *group, + struct fsnotify_event *event) { - atomic_inc(&event->refcnt); -} - -void fsnotify_put_event(struct fsnotify_event *event) -{ - if (!event) + /* Overflow events are per-group and we don't want to free them */ + if (!event || event->mask == FS_Q_OVERFLOW) return; - if (atomic_dec_and_test(&event->refcnt)) { - pr_debug("%s: event=%p\n", __func__, event); - - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_put(&event->path); - - BUG_ON(!list_empty(&event->private_data_list)); - - kfree(event->file_name); - put_pid(event->tgid); - kmem_cache_free(fsnotify_event_cachep, event); - } -} - -struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) -{ - return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); -} - -void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) -{ - if (holder) - kmem_cache_free(fsnotify_event_holder_cachep, holder); -} - -/* - * Find the private data that the group previously attached to this event when - * the group added the event to the notification queue (fsnotify_add_notify_event) - */ -struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) -{ - struct fsnotify_event_private_data *lpriv; - struct fsnotify_event_private_data *priv = NULL; - - assert_spin_locked(&event->lock); - - list_for_each_entry(lpriv, &event->private_data_list, event_list) { - if (lpriv->group == group) { - priv = lpriv; - list_del(&priv->event_list); - break; - } - } - return priv; + group->ops->free_event(event); } /* @@ -137,91 +82,35 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot * event off the queue to deal with. If the event is successfully added to the * group's notification queue, a reference is taken on event. */ -struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv, +struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, + struct fsnotify_event *event, struct fsnotify_event *(*merge)(struct list_head *, struct fsnotify_event *)) { struct fsnotify_event *return_event = NULL; - struct fsnotify_event_holder *holder = NULL; struct list_head *list = &group->notification_list; - pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); - - /* - * There is one fsnotify_event_holder embedded inside each fsnotify_event. - * Check if we expect to be able to use that holder. If not alloc a new - * holder. - * For the overflow event it's possible that something will use the in - * event holder before we get the lock so we may need to jump back and - * alloc a new holder, this can't happen for most events... - */ - if (!list_empty(&event->holder.event_list)) { -alloc_holder: - holder = fsnotify_alloc_event_holder(); - if (!holder) - return ERR_PTR(-ENOMEM); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, event); mutex_lock(&group->notification_mutex); if (group->q_len >= group->max_events) { - event = q_overflow_event; - - /* - * we need to return the overflow event - * which means we need a ref - */ - fsnotify_get_event(event); + /* Queue overflow event only if it isn't already queued */ + if (list_empty(&group->overflow_event.list)) + event = &group->overflow_event; return_event = event; - - /* sorry, no private data on the overflow event */ - priv = NULL; } if (!list_empty(list) && merge) { - struct fsnotify_event *tmp; - - tmp = merge(list, event); - if (tmp) { - mutex_unlock(&group->notification_mutex); - - if (return_event) - fsnotify_put_event(return_event); - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - return tmp; - } - } - - spin_lock(&event->lock); - - if (list_empty(&event->holder.event_list)) { - if (unlikely(holder)) - fsnotify_destroy_event_holder(holder); - holder = &event->holder; - } else if (unlikely(!holder)) { - /* between the time we checked above and got the lock the in - * event holder was used, go back and get a new one */ - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - + return_event = merge(list, event); if (return_event) { - fsnotify_put_event(return_event); - return_event = NULL; + mutex_unlock(&group->notification_mutex); + return return_event; } - - goto alloc_holder; } group->q_len++; - holder->event = event; - - fsnotify_get_event(event); - list_add_tail(&holder->event_list, list); - if (priv) - list_add_tail(&priv->event_list, &event->private_data_list); - spin_unlock(&event->lock); + list_add_tail(&event->list, list); mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); @@ -230,32 +119,20 @@ alloc_holder: } /* - * Remove and return the first event from the notification list. There is a - * reference held on this event since it was on the list. It is the responsibility - * of the caller to drop this reference. + * Remove and return the first event from the notification list. It is the + * responsibility of the caller to destroy the obtained event */ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_holder *holder; BUG_ON(!mutex_is_locked(&group->notification_mutex)); pr_debug("%s: group=%p\n", __func__, group); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - - event = holder->event; - - spin_lock(&event->lock); - holder->event = NULL; - list_del_init(&holder->event_list); - spin_unlock(&event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - + event = list_first_entry(&group->notification_list, + struct fsnotify_event, list); + list_del(&event->list); group->q_len--; return event; @@ -266,15 +143,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group */ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) { - struct fsnotify_event *event; - struct fsnotify_event_holder *holder; - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - event = holder->event; - - return event; + return list_first_entry(&group->notification_list, + struct fsnotify_event, list); } /* @@ -284,181 +156,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_private_data *priv; mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_notify_event(group); - /* if they don't implement free_event_priv they better not have attached any */ - if (group->ops->free_event_priv) { - spin_lock(&event->lock); - priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - if (priv) - group->ops->free_event_priv(priv); - } - fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ + fsnotify_destroy_event(group, event); } mutex_unlock(&group->notification_mutex); } -static void initialize_event(struct fsnotify_event *event) -{ - INIT_LIST_HEAD(&event->holder.event_list); - atomic_set(&event->refcnt, 1); - - spin_lock_init(&event->lock); - - INIT_LIST_HEAD(&event->private_data_list); -} - -/* - * Caller damn well better be holding whatever mutex is protecting the - * old_holder->event_list and the new_event must be a clean event which - * cannot be found anywhere else in the kernel. - */ -int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, - struct fsnotify_event *new_event) -{ - struct fsnotify_event *old_event = old_holder->event; - struct fsnotify_event_holder *new_holder = &new_event->holder; - - enum event_spinlock_class { - SPINLOCK_OLD, - SPINLOCK_NEW, - }; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event); - - /* - * if the new_event's embedded holder is in use someone - * screwed up and didn't give us a clean new event. - */ - BUG_ON(!list_empty(&new_holder->event_list)); - - spin_lock_nested(&old_event->lock, SPINLOCK_OLD); - spin_lock_nested(&new_event->lock, SPINLOCK_NEW); - - new_holder->event = new_event; - list_replace_init(&old_holder->event_list, &new_holder->event_list); - - spin_unlock(&new_event->lock); - spin_unlock(&old_event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (old_holder != &old_event->holder) - fsnotify_destroy_event_holder(old_holder); - - fsnotify_get_event(new_event); /* on the list take reference */ - fsnotify_put_event(old_event); /* off the list, drop reference */ - - return 0; -} - -struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event) -{ - struct fsnotify_event *event; - - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); - if (!event) - return NULL; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event); - - memcpy(event, old_event, sizeof(*event)); - initialize_event(event); - - if (event->name_len) { - event->file_name = kstrdup(old_event->file_name, GFP_KERNEL); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - } - event->tgid = get_pid(old_event->tgid); - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_get(&event->path); - - return event; -} - /* * fsnotify_create_event - Allocate a new event which will be sent to each * group's handle_event function if the group was interested in this * particular event. * - * @to_tell the inode which is supposed to receive the event (sometimes a + * @inode the inode which is supposed to receive the event (sometimes a * parent of the inode to which the event happened. * @mask what actually happened. * @data pointer to the object which was actually affected * @data_type flag indication if the data is a file, path, inode, nothing... * @name the filename, if available */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const unsigned char *name, - u32 cookie, gfp_t gfp) +void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode, + u32 mask) { - struct fsnotify_event *event; - - event = kmem_cache_zalloc(fsnotify_event_cachep, gfp); - if (!event) - return NULL; - - pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n", - __func__, event, to_tell, mask, data, data_type); - - initialize_event(event); - - if (name) { - event->file_name = kstrdup(name, gfp); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - event->name_len = strlen(event->file_name); - } - - event->tgid = get_pid(task_tgid(current)); - event->sync_cookie = cookie; - event->to_tell = to_tell; - event->data_type = data_type; - - switch (data_type) { - case FSNOTIFY_EVENT_PATH: { - struct path *path = data; - event->path.dentry = path->dentry; - event->path.mnt = path->mnt; - path_get(&event->path); - break; - } - case FSNOTIFY_EVENT_INODE: - event->inode = data; - break; - case FSNOTIFY_EVENT_NONE: - event->inode = NULL; - event->path.dentry = NULL; - event->path.mnt = NULL; - break; - default: - BUG(); - } - + INIT_LIST_HEAD(&event->list); + event->inode = inode; event->mask = mask; - - return event; -} - -static __init int fsnotify_notification_init(void) -{ - fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); - fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); - - q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_KERNEL); - if (!q_overflow_event) - panic("unable to allocate fsnotify q_overflow_event\n"); - - return 0; } -subsys_initcall(fsnotify_notification_init); diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f17e58b..ce210d4 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -38,7 +38,6 @@ ocfs2-objs := \ symlink.o \ sysfile.o \ uptodate.o \ - ver.o \ quota_local.o \ quota_global.o \ xattr.o \ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index dc7411f..8750ae1 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7260,14 +7260,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; minlen = range->minlen >> osb->s_clustersize_bits; - trimmed = 0; - - if (!len) { - range->len = 0; - return 0; - } - if (minlen >= osb->bitmap_cpg) + if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) return -EINVAL; main_bm_inode = ocfs2_get_system_file_inode(osb, @@ -7293,6 +7287,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out_unlock; } + len = range->len >> osb->s_clustersize_bits; if (start + len > le32_to_cpu(main_bm->i_clusters)) len = le32_to_cpu(main_bm->i_clusters) - start; @@ -7307,6 +7302,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); last_bit = osb->bitmap_cpg; + trimmed = 0; for (group = first_group; group <= last_group;) { if (first_bit + len >= osb->bitmap_cpg) last_bit = osb->bitmap_cpg; diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile index bc8c5e7..1aefc03 100644 --- a/fs/ocfs2/cluster/Makefile +++ b/fs/ocfs2/cluster/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ - quorum.o tcp.o netdebug.o ver.o + quorum.o tcp.o netdebug.o diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index bb24064..441c84e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -29,7 +29,6 @@ #include "heartbeat.h" #include "masklog.h" #include "sys.h" -#include "ver.h" /* for now we operate under the assertion that there can be only one * cluster active at a time. Changing this will require trickling @@ -945,8 +944,6 @@ static int __init init_o2nm(void) { int ret = -1; - cluster_print_version(); - ret = o2hb_init(); if (ret) goto out; @@ -984,6 +981,7 @@ out: MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster management"); module_init(init_o2nm) module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c deleted file mode 100644 index a56eee6..0000000 --- a/fs/ocfs2/cluster/ver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define CLUSTER_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION - -void cluster_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h deleted file mode 100644 index 32554c3..0000000 --- a/fs/ocfs2/cluster/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef O2CLUSTER_VER_H -#define O2CLUSTER_VER_H - -void cluster_print_version(void); - -#endif /* O2CLUSTER_VER_H */ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index c8a044e..bd1aab1f 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2 obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ - dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8b3382a..33660a4 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -43,8 +43,6 @@ #include "dlmdomain.h" #include "dlmdebug.h" -#include "dlmver.h" - #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" @@ -2328,8 +2326,6 @@ static int __init dlm_init(void) { int status; - dlm_print_version(); - status = dlm_init_mle_cache(); if (status) { mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); @@ -2379,6 +2375,7 @@ static void __exit dlm_exit (void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 Distributed Lock Management"); module_init(dlm_init); module_exit(dlm_exit); diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c deleted file mode 100644 index dfc0da4..0000000 --- a/fs/ocfs2/dlm/dlmver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION - -void dlm_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h deleted file mode 100644 index f674aee..0000000 --- a/fs/ocfs2/dlm/dlmver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLM_VER_H -#define DLM_VER_H - -void dlm_print_version(void); - -#endif /* DLM_VER_H */ diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index f14be89..eed3db8c 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2 obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o -ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o +ocfs2_dlmfs-objs := userdlm.o dlmfs.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index efa2b3d..09b7d9d 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -49,7 +49,6 @@ #include "stackglue.h" #include "userdlm.h" -#include "dlmfsver.h" #define MLOG_MASK_PREFIX ML_DLMFS #include "cluster/masklog.h" @@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void) int status; int cleanup_inode = 0, cleanup_worker = 0; - dlmfs_print_version(); - status = bdi_init(&dlmfs_backing_dev_info); if (status) return status; @@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 DLM-Filesystem"); module_init(init_dlmfs_fs) module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c deleted file mode 100644 index a733b33..0000000 --- a/fs/ocfs2/dlmfs/dlmfsver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmfsver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION - -void dlmfs_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h deleted file mode 100644 index f35eadb..0000000 --- a/fs/ocfs2/dlmfs/dlmfsver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLMFS_VER_H -#define DLMFS_VER_H - -void dlmfs_print_version(void); - -#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3407b2c..1998695 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2996,6 +2996,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) /* for now, uuid == domain */ status = ocfs2_cluster_connect(osb->osb_cluster_stack, + osb->osb_cluster_name, + strlen(osb->osb_cluster_name), osb->uuid_str, strlen(osb->uuid_str), &lproto, ocfs2_do_node_down, osb, @@ -3005,7 +3007,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto bail; } - status = ocfs2_cluster_this_node(&osb->node_num); + status = ocfs2_cluster_this_node(conn, &osb->node_num); if (status < 0) { mlog_errno(status); mlog(ML_ERROR, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6fff128..f42eece 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1869,7 +1869,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } size = sr->l_start + sr->l_len; - if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 || + cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) { if (sr->l_len <= 0) { ret = -EINVAL; goto out_inode_unlock; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index fa32ce9..8ca3c29 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/blkdev.h> #include <linux/compat.h> #include <cluster/masklog.h> @@ -966,15 +967,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FITRIM: { struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; + range.minlen = max_t(u64, q->limits.discard_granularity, + range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) return ret; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 631a982..64c304d 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -561,83 +561,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, - handle_t *handle, - struct buffer_head *di_bh, - u32 num_bits, - u16 chain) -{ - int ret; - u32 tmp_used; - struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - struct ocfs2_chain_list *cl = - (struct ocfs2_chain_list *) &di->id2.i_chain; - - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); - di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); - ocfs2_journal_dirty(handle, di_bh); - -out: - return ret; -} - -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits) -{ - int status; - void *bitmap = bg->bg_bitmap; - int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; - - /* All callers get the descriptor via - * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ - BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); - BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); - - mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, - num_bits); - - if (ocfs2_is_cluster_bitmap(alloc_inode)) - journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - - status = ocfs2_journal_access_gd(handle, - INODE_CACHE(alloc_inode), - group_bh, - journal_type); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - le16_add_cpu(&bg->bg_free_bits_count, -num_bits); - if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; - } - while (num_bits--) - ocfs2_set_bit(bit_off++, bitmap); - - ocfs2_journal_dirty(handle, group_bh); - -bail: - return status; -} - static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, u32 len, int ext_flags) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 3a90347..553f53c 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -387,6 +387,7 @@ struct ocfs2_super u8 osb_stackflags; char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index bf1f893..1724d43 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) return 0; } -static int o2cb_cluster_this_node(unsigned int *node) +static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { int node_num; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 286edf1..13a8537 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -23,6 +23,7 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/reboot.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include "stackglue.h" @@ -102,6 +103,12 @@ #define OCFS2_TEXT_UUID_LEN 32 #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 +#define VERSION_LOCK "version_lock" + +enum ocfs2_connection_type { + WITH_CONTROLD, + NO_CONTROLD +}; /* * ocfs2_live_connection is refcounted because the filesystem and @@ -110,6 +117,13 @@ struct ocfs2_live_connection { struct list_head oc_list; struct ocfs2_cluster_connection *oc_conn; + enum ocfs2_connection_type oc_type; + atomic_t oc_this_node; + int oc_our_slot; + struct dlm_lksb oc_version_lksb; + char oc_lvb[DLM_LVB_LEN]; + struct completion oc_sync_wait; + wait_queue_head_t oc_wait; }; struct ocfs2_control_private { @@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) * mount path. Since the VFS prevents multiple calls to * fill_super(), we can't get dupes here. */ -static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, - struct ocfs2_live_connection **c_ret) +static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, + struct ocfs2_live_connection *c) { int rc = 0; - struct ocfs2_live_connection *c; - - c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); - if (!c) - return -ENOMEM; mutex_lock(&ocfs2_control_lock); c->oc_conn = conn; - if (atomic_read(&ocfs2_control_opened)) + if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) list_add(&c->oc_list, &ocfs2_live_connection_list); else { printk(KERN_ERR @@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, } mutex_unlock(&ocfs2_control_lock); - - if (!rc) - *c_ret = c; - else - kfree(c); - return rc; } @@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, return 0; } +static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + ver->pv_major = pv->pv_major; + ver->pv_minor = pv->pv_minor; +} + +static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + pv->pv_major = ver->pv_major; + pv->pv_minor = ver->pv_minor; +} + +static void sync_wait_cb(void *arg) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + complete(&lc->oc_sync_wait); +} + +static int sync_unlock(struct ocfs2_cluster_connection *conn, + struct dlm_lksb *lksb, char *name) +{ + int error; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn); + if (error) { + printk(KERN_ERR "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + printk(KERN_ERR "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct ocfs2_cluster_connection *conn, + int mode, uint32_t flags, + struct dlm_lksb *lksb, char *name) +{ + int error, status; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_lock(conn->cc_lockspace, mode, lksb, flags, + name, strlen(name), + 0, sync_wait_cb, conn, NULL); + if (error) { + printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + + +static int version_lock(struct ocfs2_cluster_connection *conn, int mode, + int flags) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_lock(conn, mode, flags, + &lc->oc_version_lksb, VERSION_LOCK); +} + +static int version_unlock(struct ocfs2_cluster_connection *conn) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); +} + +/* get_protocol_version() + * + * To exchange ocfs2 versioning, we use the LVB of the version dlm lock. + * The algorithm is: + * 1. Attempt to take the lock in EX mode (non-blocking). + * 2. If successful (which means it is the first mount), write the + * version number and downconvert to PR lock. + * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after + * taking the PR lock. + */ + +static int get_protocol_version(struct ocfs2_cluster_connection *conn) +{ + int ret; + struct ocfs2_live_connection *lc = conn->cc_private; + struct ocfs2_protocol_version pv; + + running_proto.pv_major = + ocfs2_user_plugin.sp_max_proto.pv_major; + running_proto.pv_minor = + ocfs2_user_plugin.sp_max_proto.pv_minor; + + lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb; + ret = version_lock(conn, DLM_LOCK_EX, + DLM_LKF_VALBLK|DLM_LKF_NOQUEUE); + if (!ret) { + conn->cc_version.pv_major = running_proto.pv_major; + conn->cc_version.pv_minor = running_proto.pv_minor; + version_to_lvb(&running_proto, lc->oc_lvb); + version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + } else if (ret == -EAGAIN) { + ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK); + if (ret) + goto out; + lvb_to_version(lc->oc_lvb, &pv); + + if ((pv.pv_major != running_proto.pv_major) || + (pv.pv_minor > running_proto.pv_minor)) { + ret = -EINVAL; + goto out; + } + + conn->cc_version.pv_major = pv.pv_major; + conn->cc_version.pv_minor = pv.pv_minor; + } +out: + return ret; +} + +static void user_recover_prep(void *arg) +{ +} + +static void user_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct ocfs2_cluster_connection *conn = arg; + printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n", + slot->nodeid, slot->slot); + conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data); + +} + +static void user_recover_done(void *arg, struct dlm_slot *slots, + int num_slots, int our_slot, + uint32_t generation) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + int i; + + for (i = 0; i < num_slots; i++) + if (slots[i].slot == our_slot) { + atomic_set(&lc->oc_this_node, slots[i].nodeid); + break; + } + + lc->oc_our_slot = our_slot; + wake_up(&lc->oc_wait); +} + +static const struct dlm_lockspace_ops ocfs2_ls_ops = { + .recover_prep = user_recover_prep, + .recover_slot = user_recover_slot, + .recover_done = user_recover_done, +}; + +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + version_unlock(conn); + dlm_release_lockspace(conn->cc_lockspace, 2); + conn->cc_lockspace = NULL; + ocfs2_live_connection_drop(conn->cc_private); + conn->cc_private = NULL; + return 0; +} + static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; - struct ocfs2_live_connection *uninitialized_var(control); - int rc = 0; + struct ocfs2_live_connection *lc; + int rc, ops_rv; BUG_ON(conn == NULL); - rc = ocfs2_live_connection_new(conn, &control); + lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); + if (!lc) { + rc = -ENOMEM; + goto out; + } + + init_waitqueue_head(&lc->oc_wait); + init_completion(&lc->oc_sync_wait); + atomic_set(&lc->oc_this_node, 0); + conn->cc_private = lc; + lc->oc_type = NO_CONTROLD; + + rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, + DLM_LSFL_FS, DLM_LVB_LEN, + &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); + if (rc) + goto out; + + if (ops_rv == -EOPNOTSUPP) { + lc->oc_type = WITH_CONTROLD; + printk(KERN_NOTICE "ocfs2: You seem to be using an older " + "version of dlm_controld and/or ocfs2-tools." + " Please consider upgrading.\n"); + } else if (ops_rv) { + rc = ops_rv; + goto out; + } + conn->cc_lockspace = fsdlm; + + rc = ocfs2_live_connection_attach(conn, lc); if (rc) goto out; + if (lc->oc_type == NO_CONTROLD) { + rc = get_protocol_version(conn); + if (rc) { + printk(KERN_ERR "ocfs2: Could not determine" + " locking version\n"); + user_cluster_disconnect(conn); + goto out; + } + wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); + } + /* * running_proto must have been set before we allowed any mounts * to proceed. @@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) if (fs_protocol_compare(&running_proto, &conn->cc_version)) { printk(KERN_ERR "Unable to mount with fs locking protocol version " - "%u.%u because the userspace control daemon has " - "negotiated %u.%u\n", + "%u.%u because negotiated protocol is %u.%u\n", conn->cc_version.pv_major, conn->cc_version.pv_minor, running_proto.pv_major, running_proto.pv_minor); rc = -EPROTO; - ocfs2_live_connection_drop(control); - goto out; - } - - rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, - NULL, NULL, NULL, &fsdlm); - if (rc) { - ocfs2_live_connection_drop(control); - goto out; + ocfs2_live_connection_drop(lc); + lc = NULL; } - conn->cc_private = control; - conn->cc_lockspace = fsdlm; out: + if (rc && lc) + kfree(lc); return rc; } -static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) -{ - dlm_release_lockspace(conn->cc_lockspace, 2); - conn->cc_lockspace = NULL; - ocfs2_live_connection_drop(conn->cc_private); - conn->cc_private = NULL; - return 0; -} -static int user_cluster_this_node(unsigned int *this_node) +static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *this_node) { int rc; + struct ocfs2_live_connection *lc = conn->cc_private; + + if (lc->oc_type == WITH_CONTROLD) + rc = ocfs2_control_get_this_node(); + else if (lc->oc_type == NO_CONTROLD) + rc = atomic_read(&lc->oc_this_node); + else + rc = -EINVAL; - rc = ocfs2_control_get_this_node(); if (rc < 0) return rc; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index cb7ec0b..1324e66 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, EXPORT_SYMBOL_GPL(ocfs2_plock); int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, @@ -342,8 +344,10 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } - memcpy(new_conn->cc_name, group, grouplen); + strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; + strlcpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); + new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; @@ -386,8 +390,9 @@ int ocfs2_cluster_connect_agnostic(const char *group, if (cluster_stack_name[0]) stack_name = cluster_stack_name; - return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, - recovery_handler, recovery_data, conn); + return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, + lproto, recovery_handler, recovery_data, + conn); } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); @@ -460,9 +465,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen) } EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); -int ocfs2_cluster_this_node(unsigned int *node) +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { - return active_stack->sp_ops->this_node(node); + return active_stack->sp_ops->this_node(conn, node); } EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 1ec56fd..66334a3 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -45,6 +45,9 @@ struct file_lock; */ #define GROUP_NAME_MAX 64 +/* This shadows OCFS2_CLUSTER_NAME_LEN */ +#define CLUSTER_NAME_MAX 16 + /* * ocfs2_protocol_version changes when ocfs2 does something different in @@ -97,8 +100,10 @@ struct ocfs2_locking_protocol { * locking compatibility. */ struct ocfs2_cluster_connection { - char cc_name[GROUP_NAME_MAX]; + char cc_name[GROUP_NAME_MAX + 1]; int cc_namelen; + char cc_cluster_name[CLUSTER_NAME_MAX + 1]; + int cc_cluster_name_len; struct ocfs2_protocol_version cc_version; struct ocfs2_locking_protocol *cc_proto; void (*cc_recovery_handler)(int node_num, void *recovery_data); @@ -152,7 +157,8 @@ struct ocfs2_stack_operations { * ->this_node() returns the cluster's unique identifier for the * local node. */ - int (*this_node)(unsigned int *node); + int (*this_node)(struct ocfs2_cluster_connection *conn, + unsigned int *node); /* * Call the underlying dlm lock function. The ->dlm_lock() @@ -239,6 +245,8 @@ struct ocfs2_stack_plugin { /* Used by the filesystem */ int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, @@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group, int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending); void ocfs2_cluster_hangup(const char *group, int grouplen); -int ocfs2_cluster_this_node(unsigned int *node); +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node); struct ocfs2_lock_res; int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 2c91452..47ae266 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, struct ocfs2_suballoc_result *res); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits); static int ocfs2_relink_block_group(handle_t *handle, struct inode *alloc_inode, struct buffer_head *fe_bh, @@ -1343,7 +1337,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, return status; } -static inline int ocfs2_block_group_set_bits(handle_t *handle, +int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, @@ -1388,8 +1382,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, ocfs2_journal_dirty(handle, group_bh); bail: - if (status) - mlog_errno(status); return status; } @@ -1588,7 +1580,7 @@ static int ocfs2_block_group_search(struct inode *inode, return ret; } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, +int ocfs2_alloc_dinode_update_counts(struct inode *inode, handle_t *handle, struct buffer_head *di_bh, u32 num_bits, diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a36d0aa..218d803 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -86,6 +86,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, u32 bits_wanted, struct ocfs2_alloc_context **ac); +int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); +int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); + int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c414929..49d84f8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -68,7 +68,6 @@ #include "super.h" #include "sysfile.h" #include "uptodate.h" -#include "ver.h" #include "xattr.h" #include "quota.h" #include "refcounttree.h" @@ -90,6 +89,7 @@ static struct dentry *ocfs2_debugfs_root = NULL; MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster file system"); struct mount_options { @@ -1618,8 +1618,6 @@ static int __init ocfs2_init(void) { int status, i; - ocfs2_print_version(); - for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) init_waitqueue_head(&ocfs2__ioend_wq[i]); @@ -1947,11 +1945,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_shutdown_local_alloc(osb); - ocfs2_truncate_log_shutdown(osb); - /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); + /* + * During dismount, when it recovers another node it will call + * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq. + */ + ocfs2_truncate_log_shutdown(osb); + ocfs2_journal_shutdown(osb); ocfs2_sync_blockdev(sb); @@ -2225,10 +2227,9 @@ static int ocfs2_initialize_super(struct super_block *sb, if (ocfs2_clusterinfo_valid(osb)) { osb->osb_stackflags = OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; - memcpy(osb->osb_cluster_stack, + strlcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, - OCFS2_STACK_LABEL_LEN); - osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + OCFS2_STACK_LABEL_LEN + 1); if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "couldn't mount because of an invalid " @@ -2237,6 +2238,9 @@ static int ocfs2_initialize_super(struct super_block *sb, status = -EINVAL; goto bail; } + strlcpy(osb->osb_cluster_name, + OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, + OCFS2_CLUSTER_NAME_LEN + 1); } else { /* The empty string is identical with classic tools that * don't know about s_cluster_info. */ diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c deleted file mode 100644 index e2488f4..0000000 --- a/fs/ocfs2/ver.c +++ /dev/null @@ -1,43 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define OCFS2_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION - -void ocfs2_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h deleted file mode 100644 index d7395cb..0000000 --- a/fs/ocfs2/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef OCFS2_VER_H -#define OCFS2_VER_H - -void ocfs2_print_version(void); - -#endif /* OCFS2_VER_H */ diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8bd2135..021e7c0 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -22,11 +22,80 @@ #include <linux/errno.h> -EXPORT_SYMBOL(posix_acl_init); -EXPORT_SYMBOL(posix_acl_alloc); -EXPORT_SYMBOL(posix_acl_valid); -EXPORT_SYMBOL(posix_acl_equiv_mode); -EXPORT_SYMBOL(posix_acl_from_mode); +struct posix_acl **acl_by_type(struct inode *inode, int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return &inode->i_acl; + case ACL_TYPE_DEFAULT: + return &inode->i_default_acl; + default: + BUG(); + } +} +EXPORT_SYMBOL(acl_by_type); + +struct posix_acl *get_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *acl = ACCESS_ONCE(*p); + if (acl) { + spin_lock(&inode->i_lock); + acl = *p; + if (acl != ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } + return acl; +} +EXPORT_SYMBOL(get_cached_acl); + +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) +{ + return rcu_dereference(*acl_by_type(inode, type)); +} +EXPORT_SYMBOL(get_cached_acl_rcu); + +void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + rcu_assign_pointer(*p, posix_acl_dup(acl)); + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(set_cached_acl); + +void forget_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + *p = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(forget_cached_acl); + +void forget_all_cached_acls(struct inode *inode) +{ + struct posix_acl *old_access, *old_default; + spin_lock(&inode->i_lock); + old_access = inode->i_acl; + old_default = inode->i_default_acl; + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old_access != ACL_NOT_CACHED) + posix_acl_release(old_access); + if (old_default != ACL_NOT_CACHED) + posix_acl_release(old_default); +} +EXPORT_SYMBOL(forget_all_cached_acls); /* * Init a fresh posix_acl @@ -37,6 +106,7 @@ posix_acl_init(struct posix_acl *acl, int count) atomic_set(&acl->a_refcount, 1); acl->a_count = count; } +EXPORT_SYMBOL(posix_acl_init); /* * Allocate a new ACL with the specified number of entries. @@ -51,6 +121,7 @@ posix_acl_alloc(int count, gfp_t flags) posix_acl_init(acl, count); return acl; } +EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. @@ -146,6 +217,7 @@ posix_acl_valid(const struct posix_acl *acl) return 0; return -EINVAL; } +EXPORT_SYMBOL(posix_acl_valid); /* * Returns 0 if the acl can be exactly represented in the traditional @@ -186,6 +258,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } +EXPORT_SYMBOL(posix_acl_equiv_mode); /* * Create an ACL representing the file mode permission bits of an inode. @@ -207,6 +280,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags) acl->a_entries[2].e_perm = (mode & S_IRWXO); return acl; } +EXPORT_SYMBOL(posix_acl_from_mode); /* * Return 0 if current is granted want access to the inode diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a77d2b2..24270ec 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -26,7 +26,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) unsigned long committed; struct vmalloc_info vmi; long cached; + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; int lru; /* @@ -47,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_page_state(NR_LRU_BASE + lru); + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + * + * Free memory cannot be taken below the low watermark, before the + * system starts swapping. + */ + available = i.freeram - wmark_low; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable swap consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + /* * Tagged format, for easy grepping and expansion. */ seq_printf(m, "MemTotal: %8lu kB\n" "MemFree: %8lu kB\n" + "MemAvailable: %8lu kB\n" "Buffers: %8lu kB\n" "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" @@ -105,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) , K(i.totalram), K(i.freeram), + K(available), K(i.bufferram), K(cached), K(total_swapcache_pages()), diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 39d1465..6a3e2c4 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -275,4 +275,4 @@ int __init init_ramfs_fs(void) return err; } -module_init(init_ramfs_fs) +fs_initcall(init_ramfs_fs); diff --git a/fs/read_write.c b/fs/read_write.c index 58e440d..1193ffd 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -901,10 +901,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, io_fn_t fn; iov_fn_t fnv; - ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) - goto out; - ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, UIO_FASTIOV, iovstack, &iov); if (ret <= 0) @@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) if (!s) return NULL; + INIT_LIST_HEAD(&s->s_mounts); + if (security_sb_alloc(s)) goto fail; @@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) if (list_lru_init(&s->s_inode_lru)) goto fail; - INIT_LIST_HEAD(&s->s_mounts); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index f1f07d3..2fae55d 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -5,6 +5,7 @@ #define _LINUX_BOOTMEM_H #include <linux/mmzone.h> +#include <linux/mm_types.h> #include <asm/dma.h> /* @@ -52,7 +53,6 @@ extern void free_bootmem_node(pg_data_t *pgdat, unsigned long size); extern void free_bootmem(unsigned long physaddr, unsigned long size); extern void free_bootmem_late(unsigned long physaddr, unsigned long size); -extern void __free_pages_bootmem(struct page *page, unsigned int order); /* * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, @@ -142,6 +142,157 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) + +#if defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) + +/* FIXME: use MEMBLOCK_ALLOC_* variants here */ +#define BOOTMEM_ALLOC_ACCESSIBLE 0 +#define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0) + +/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */ +void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size, + phys_addr_t align, phys_addr_t min_addr, + phys_addr_t max_addr, int nid); +void *memblock_virt_alloc_try_nid(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, int nid); +void __memblock_free_early(phys_addr_t base, phys_addr_t size); +void __memblock_free_late(phys_addr_t base, phys_addr_t size); + +static inline void * __init memblock_virt_alloc( + phys_addr_t size, phys_addr_t align) +{ + return memblock_virt_alloc_try_nid(size, align, BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + +static inline void * __init memblock_virt_alloc_nopanic( + phys_addr_t size, phys_addr_t align) +{ + return memblock_virt_alloc_try_nid_nopanic(size, align, + BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + +static inline void * __init memblock_virt_alloc_from_nopanic( + phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) +{ + return memblock_virt_alloc_try_nid_nopanic(size, align, min_addr, + BOOTMEM_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + +static inline void * __init memblock_virt_alloc_node( + phys_addr_t size, int nid) +{ + return memblock_virt_alloc_try_nid(size, 0, BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, nid); +} + +static inline void * __init memblock_virt_alloc_node_nopanic( + phys_addr_t size, int nid) +{ + return memblock_virt_alloc_try_nid_nopanic(size, 0, BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, + nid); +} + +static inline void __init memblock_free_early( + phys_addr_t base, phys_addr_t size) +{ + __memblock_free_early(base, size); +} + +static inline void __init memblock_free_early_nid( + phys_addr_t base, phys_addr_t size, int nid) +{ + __memblock_free_early(base, size); +} + +static inline void __init memblock_free_late( + phys_addr_t base, phys_addr_t size) +{ + __memblock_free_late(base, size); +} + +#else + +#define BOOTMEM_ALLOC_ACCESSIBLE 0 + + +/* Fall back to all the existing bootmem APIs */ +static inline void * __init memblock_virt_alloc( + phys_addr_t size, phys_addr_t align) +{ + if (!align) + align = SMP_CACHE_BYTES; + return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT); +} + +static inline void * __init memblock_virt_alloc_nopanic( + phys_addr_t size, phys_addr_t align) +{ + if (!align) + align = SMP_CACHE_BYTES; + return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT); +} + +static inline void * __init memblock_virt_alloc_from_nopanic( + phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) +{ + return __alloc_bootmem_nopanic(size, align, min_addr); +} + +static inline void * __init memblock_virt_alloc_node( + phys_addr_t size, int nid) +{ + return __alloc_bootmem_node(NODE_DATA(nid), size, SMP_CACHE_BYTES, + BOOTMEM_LOW_LIMIT); +} + +static inline void * __init memblock_virt_alloc_node_nopanic( + phys_addr_t size, int nid) +{ + return __alloc_bootmem_node_nopanic(NODE_DATA(nid), size, + SMP_CACHE_BYTES, + BOOTMEM_LOW_LIMIT); +} + +static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) +{ + return __alloc_bootmem_node_high(NODE_DATA(nid), size, align, + min_addr); +} + +static inline void * __init memblock_virt_alloc_try_nid_nopanic( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, int nid) +{ + return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align, + min_addr, max_addr); +} + +static inline void __init memblock_free_early( + phys_addr_t base, phys_addr_t size) +{ + free_bootmem(base, size); +} + +static inline void __init memblock_free_early_nid( + phys_addr_t base, phys_addr_t size, int nid) +{ + free_bootmem_node(NODE_DATA(nid), base, size); +} + +static inline void __init memblock_free_late( + phys_addr_t base, phys_addr_t size) +{ + free_bootmem_late(base, size); +} +#endif /* defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) */ + #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP extern void *alloc_remap(int nid, unsigned long size); #else diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 091d72e..7e1c76e 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -62,6 +62,22 @@ static inline bool compaction_deferred(struct zone *zone, int order) return zone->compact_considered < defer_limit; } +/* + * Update defer tracking counters after successful compaction of given order, + * which means an allocation either succeeded (alloc_success == true) or is + * expected to succeed. + */ +static inline void compaction_defer_reset(struct zone *zone, int order, + bool alloc_success) +{ + if (alloc_success) { + zone->compact_considered = 0; + zone->compact_defer_shift = 0; + } + if (order >= zone->compact_order_failed) + zone->compact_order_failed = order + 1; +} + /* Returns true if restarting compaction after many failures */ static inline bool compaction_restarting(struct zone *zone, int order) { diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index fc0e34c..fe8cb61 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -85,6 +85,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev, extern void debug_dma_dump_mappings(struct device *dev); +extern void debug_dma_assert_idle(struct page *page); + #else /* CONFIG_DMA_API_DEBUG */ static inline void dma_debug_add_bus(struct bus_type *bus) @@ -183,6 +185,10 @@ static inline void debug_dma_dump_mappings(struct device *dev) { } +static inline void debug_dma_assert_idle(struct page *page) +{ +} + #endif /* CONFIG_DMA_API_DEBUG */ #endif /* __DMA_DEBUG_H */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 4b2ee8d1..7d8d5e6 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -15,7 +15,6 @@ #include <linux/path.h> /* struct path */ #include <linux/spinlock.h> #include <linux/types.h> - #include <linux/atomic.h> /* @@ -79,6 +78,7 @@ struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; +struct fsnotify_fname; /* * Each group much define these ops. The fsnotify infrastructure will call @@ -94,17 +94,27 @@ struct fsnotify_event_private_data; * userspace messages that marks have been removed. */ struct fsnotify_ops { - bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type); int (*handle_event)(struct fsnotify_group *group, + struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event); + u32 mask, void *data, int data_type, + const unsigned char *file_name); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); - void (*free_event_priv)(struct fsnotify_event_private_data *priv); + void (*free_event)(struct fsnotify_event *event); +}; + +/* + * all of the information about the original object we want to now send to + * a group. If you want to carry more info from the accessing task to the + * listener this structure is where you need to be adding fields. + */ +struct fsnotify_event { + struct list_head list; + /* inode may ONLY be dereferenced during handle_event(). */ + struct inode *inode; /* either the inode the event happened to or its parent */ + u32 mask; /* the type of access, bitwise OR for FS_* event types */ }; /* @@ -148,7 +158,11 @@ struct fsnotify_group { * a group */ struct list_head marks_list; /* all inode marks for this group */ - struct fasync_struct *fsn_fa; /* async notification */ + struct fasync_struct *fsn_fa; /* async notification */ + + struct fsnotify_event overflow_event; /* Event we queue when the + * notification list is too + * full */ /* groups can define private fields here or use the void *private */ union { @@ -177,76 +191,10 @@ struct fsnotify_group { }; }; -/* - * A single event can be queued in multiple group->notification_lists. - * - * each group->notification_list will point to an event_holder which in turns points - * to the actual event that needs to be sent to userspace. - * - * Seemed cheaper to create a refcnt'd event and a small holder for every group - * than create a different event for every group - * - */ -struct fsnotify_event_holder { - struct fsnotify_event *event; - struct list_head event_list; -}; - -/* - * Inotify needs to tack data onto an event. This struct lets us later find the - * correct private data of the correct group. - */ -struct fsnotify_event_private_data { - struct fsnotify_group *group; - struct list_head event_list; -}; - -/* - * all of the information about the original object we want to now send to - * a group. If you want to carry more info from the accessing task to the - * listener this structure is where you need to be adding fields. - */ -struct fsnotify_event { - /* - * If we create an event we are also likely going to need a holder - * to link to a group. So embed one holder in the event. Means only - * one allocation for the common case where we only have one group - */ - struct fsnotify_event_holder holder; - spinlock_t lock; /* protection for the associated event_holder and private_list */ - /* to_tell may ONLY be dereferenced during handle_event(). */ - struct inode *to_tell; /* either the inode the event happened to or its parent */ - /* - * depending on the event type we should have either a path or inode - * We hold a reference on path, but NOT on inode. Since we have the ref on - * the path, it may be dereferenced at any point during this object's - * lifetime. That reference is dropped when this object's refcnt hits - * 0. If this event contains an inode instead of a path, the inode may - * ONLY be used during handle_event(). - */ - union { - struct path path; - struct inode *inode; - }; /* when calling fsnotify tell it if the data is a path or inode */ #define FSNOTIFY_EVENT_NONE 0 #define FSNOTIFY_EVENT_PATH 1 #define FSNOTIFY_EVENT_INODE 2 - int data_type; /* which of the above union we have */ - atomic_t refcnt; /* how many groups still are using/need to send this event */ - __u32 mask; /* the type of access, bitwise OR for FS_* event types */ - - u32 sync_cookie; /* used to corrolate events, namely inotify mv events */ - const unsigned char *file_name; - size_t name_len; - struct pid *tgid; - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - __u32 response; /* userspace answer to question */ -#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ - - struct list_head private_data_list; /* groups can store private data here */ -}; /* * Inode specific fields in an fsnotify_mark @@ -370,17 +318,12 @@ extern void fsnotify_put_group(struct fsnotify_group *group); extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ extern int fsnotify_fasync(int fd, struct file *file, int on); -/* take a reference to an event */ -extern void fsnotify_get_event(struct fsnotify_event *event); -extern void fsnotify_put_event(struct fsnotify_event *event); -/* find private data previously attached to an event and unlink it */ -extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, - struct fsnotify_event *event); - +/* Free event from memory */ +extern void fsnotify_destroy_event(struct fsnotify_group *group, + struct fsnotify_event *event); /* attach the event to the group notification queue */ extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv, struct fsnotify_event *(*merge)(struct list_head *, struct fsnotify_event *)); /* true if the group notification queue is empty */ @@ -430,15 +373,8 @@ extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_unmount_inodes(struct list_head *list); /* put here because inotify does some weird stuff when destroying watches */ -extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_is, - const unsigned char *name, - u32 cookie, gfp_t gfp); - -/* fanotify likes to change events after they are on lists... */ -extern struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event); -extern int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, - struct fsnotify_event *new_event); +extern void fsnotify_init_event(struct fsnotify_event *event, + struct inode *to_tell, u32 mask); #else diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 91672e2..db51201 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -157,6 +157,26 @@ static inline int hpage_nr_pages(struct page *page) return HPAGE_PMD_NR; return 1; } +/* + * compound_trans_head() should be used instead of compound_head(), + * whenever the "page" passed as parameter could be the tail of a + * transparent hugepage that could be undergoing a + * __split_huge_page_refcount(). The page structure layout often + * changes across releases and it makes extensive use of unions. So if + * the page structure layout will change in a way that + * page->first_page gets clobbered by __split_huge_page_refcount, the + * implementation making use of smp_rmb() will be required. + * + * Currently we define compound_trans_head as compound_head, because + * page->private is in the same union with page->first_page, and + * page->private isn't clobbered. However this also means we're + * currently leaving dirt into the page->private field of anonymous + * pages resulting from a THP split, instead of setting page->private + * to zero like for every other page that has PG_private not set. But + * anonymous pages don't use page->private so this is not a problem. + */ +#if 0 +/* This will be needed if page->private will be clobbered in split_huge_page */ static inline struct page *compound_trans_head(struct page *page) { if (PageTail(page)) { @@ -174,6 +194,9 @@ static inline struct page *compound_trans_head(struct page *page) } return page; } +#else +#define compound_trans_head(page) compound_head(page) +#endif extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bd7e987..d01cc97 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -31,7 +31,6 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); void hugepage_put_subpool(struct hugepage_subpool *spool); int PageHuge(struct page *page); -int PageHeadHuge(struct page *page_head); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); @@ -104,11 +103,6 @@ static inline int PageHuge(struct page *page) return 0; } -static inline int PageHeadHuge(struct page *page_head) -{ - return 0; -} - static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { } @@ -360,6 +354,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, static inline struct hstate *page_hstate(struct page *page) { + VM_BUG_ON(!PageHuge(page)); return size_to_hstate(PAGE_SIZE << compound_order(page)); } diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f0e5238..1516a8f 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -41,6 +41,7 @@ extern struct fs_struct init_fs; #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ + .thread_head = LIST_HEAD_INIT(init_task.thread_node), \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ .shared_pending = { \ .list = LIST_HEAD_INIT(sig.shared_pending.list), \ @@ -222,6 +223,7 @@ extern struct task_group root_task_group; [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ }, \ .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ + .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \ INIT_IDS \ INIT_PERF_EVENTS(tsk) \ INIT_TRACE_IRQFLAGS \ diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 45c9b6a..3be6bb1 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -73,11 +73,7 @@ static inline void set_page_stable_node(struct page *page, struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -int page_referenced_ksm(struct page *page, - struct mem_cgroup *memcg, unsigned long *vm_flags); -int try_to_unmap_ksm(struct page *page, enum ttu_flags flags); -int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg); +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); #else /* !CONFIG_KSM */ @@ -115,13 +111,8 @@ static inline int page_referenced_ksm(struct page *page, return 0; } -static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) -{ - return 0; -} - -static inline int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page*, - struct vm_area_struct *, unsigned long, void *), void *arg) +static inline int rmap_walk_ksm(struct page *page, + struct rmap_walk_control *rwc) { return 0; } diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 77c60e5..cd0274b 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -19,9 +19,13 @@ #define INIT_MEMBLOCK_REGIONS 128 +/* Definition of memblock flags. */ +#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */ + struct memblock_region { phys_addr_t base; phys_addr_t size; + unsigned long flags; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int nid; #endif @@ -43,12 +47,17 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; +#ifdef CONFIG_MOVABLE_NODE +/* If movable_node boot option specified */ +extern bool movable_node_enabled; +#endif /* CONFIG_MOVABLE_NODE */ #define memblock_dbg(fmt, ...) \ if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) -phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align, int nid); +phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, + phys_addr_t start, phys_addr_t end, + int nid); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); @@ -59,6 +68,28 @@ int memblock_remove(phys_addr_t base, phys_addr_t size); int memblock_free(phys_addr_t base, phys_addr_t size); int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); +int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); +int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); +#ifdef CONFIG_MOVABLE_NODE +static inline bool memblock_is_hotpluggable(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_HOTPLUG; +} + +static inline bool movable_node_is_enabled(void) +{ + return movable_node_enabled; +} +#else +static inline bool memblock_is_hotpluggable(struct memblock_region *m) +{ + return false; +} +static inline bool movable_node_is_enabled(void) +{ + return false; +} +#endif #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, @@ -87,7 +118,7 @@ void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start, /** * for_each_free_mem_range - iterate through free memblock areas * @i: u64 used as loop variable - * @nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %NUMA_NO_NODE for all nodes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL @@ -107,7 +138,7 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, /** * for_each_free_mem_range_reverse - rev-iterate through free memblock areas * @i: u64 used as loop variable - * @nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %NUMA_NO_NODE for all nodes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL @@ -121,8 +152,21 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, i != (u64)ULLONG_MAX; \ __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid)) +static inline void memblock_set_region_flags(struct memblock_region *r, + unsigned long flags) +{ + r->flags |= flags; +} + +static inline void memblock_clear_region_flags(struct memblock_region *r, + unsigned long flags) +{ + r->flags &= ~flags; +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid); +int memblock_set_node(phys_addr_t base, phys_addr_t size, + struct memblock_type *type, int nid); static inline void memblock_set_region_node(struct memblock_region *r, int nid) { diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 9fe426b..5f1ea75 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -211,20 +211,8 @@ static inline void mpol_get(struct mempolicy *pol) { } -static inline struct mempolicy *mpol_dup(struct mempolicy *old) -{ - return NULL; -} - struct shared_policy {}; -static inline int mpol_set_shared_policy(struct shared_policy *info, - struct vm_area_struct *vma, - struct mempolicy *new) -{ - return -EINVAL; -} - static inline void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { @@ -234,12 +222,6 @@ static inline void mpol_free_shared_policy(struct shared_policy *p) { } -static inline struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) -{ - return NULL; -} - #define vma_policy(vma) NULL static inline int @@ -266,10 +248,6 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } -static inline void mpol_fix_fork_child_flag(struct task_struct *p) -{ -} - static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) @@ -284,12 +262,6 @@ static inline bool init_nodemask_of_mempolicy(nodemask_t *m) return false; } -static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk, - const nodemask_t *mask) -{ - return false; -} - static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { @@ -307,10 +279,6 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol) } #endif -static inline void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) -{ -} - static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long address) { diff --git a/include/linux/migrate.h b/include/linux/migrate.h index f015c05..84a31ad 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -35,16 +35,12 @@ enum migrate_reason { #ifdef CONFIG_MIGRATION -extern void putback_lru_pages(struct list_head *l); extern void putback_movable_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason); -extern int fail_migrate_page(struct address_space *, - struct page *, struct page *); - extern int migrate_prep(void); extern int migrate_prep_local(void); extern int migrate_vmas(struct mm_struct *mm, @@ -59,7 +55,6 @@ extern int migrate_page_move_mapping(struct address_space *mapping, int extra_count); #else -static inline void putback_lru_pages(struct list_head *l) {} static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason) @@ -86,7 +81,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, /* Possible settings for the migrate_page() method in address_operations */ #define migrate_page NULL -#define fail_migrate_page NULL #endif /* CONFIG_MIGRATION */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 3552717..a512dd8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -57,6 +57,15 @@ extern int sysctl_legacy_va_layout; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; +extern int sysctl_overcommit_memory; +extern int sysctl_overcommit_ratio; +extern unsigned long sysctl_overcommit_kbytes; + +extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, + size_t *, loff_t *); +extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, + size_t *, loff_t *); + #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) /* to align the pointer to the (next) page boundary */ @@ -414,15 +423,44 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } +#ifdef CONFIG_HUGETLB_PAGE +extern int PageHeadHuge(struct page *page_head); +#else /* CONFIG_HUGETLB_PAGE */ +static inline int PageHeadHuge(struct page *page_head) +{ + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +static inline bool __compound_tail_refcounted(struct page *page) +{ + return !PageSlab(page) && !PageHeadHuge(page); +} + +/* + * This takes a head page as parameter and tells if the + * tail page reference counting can be skipped. + * + * For this to be safe, PageSlab and PageHeadHuge must remain true on + * any given page where they return true here, until all tail pins + * have been released. + */ +static inline bool compound_tail_refcounted(struct page *page) +{ + VM_BUG_ON(!PageHead(page)); + return __compound_tail_refcounted(page); +} + static inline void get_huge_page_tail(struct page *page) { /* - * __split_huge_page_refcount() cannot run - * from under us. + * __split_huge_page_refcount() cannot run from under us. */ + VM_BUG_ON(!PageTail(page)); VM_BUG_ON(page_mapcount(page) < 0); VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); + if (compound_tail_refcounted(page->first_page)) + atomic_inc(&page->_mapcount); } extern bool __get_page_tail(struct page *page); @@ -846,11 +884,14 @@ static __always_inline void *lowmem_page_address(const struct page *page) #endif #if defined(WANT_PAGE_VIRTUAL) -#define page_address(page) ((page)->virtual) -#define set_page_address(page, address) \ - do { \ - (page)->virtual = (address); \ - } while(0) +static inline void *page_address(const struct page *page) +{ + return page->virtual; +} +static inline void set_page_address(struct page *page, void *address) +{ + page->virtual = address; +} #define page_address_init() do { } while(0) #endif @@ -984,7 +1025,6 @@ extern void pagefault_out_of_memory(void); * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); @@ -1318,6 +1358,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS +void __init ptlock_cache_init(void); extern bool ptlock_alloc(struct page *page); extern void ptlock_free(struct page *page); @@ -1326,6 +1367,10 @@ static inline spinlock_t *ptlock_ptr(struct page *page) return page->ptl; } #else /* ALLOC_SPLIT_PTLOCKS */ +static inline void ptlock_cache_init(void) +{ +} + static inline bool ptlock_alloc(struct page *page) { return true; @@ -1378,10 +1423,17 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } +static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct page *page) { return true; } static inline void pte_lock_deinit(struct page *page) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ +static inline void pgtable_init(void) +{ + ptlock_cache_init(); + pgtable_cache_init(); +} + static inline bool pgtable_page_ctor(struct page *page) { inc_zone_page_state(page, NR_PAGETABLE); diff --git a/include/linux/mman.h b/include/linux/mman.h index 7f7f8da..16373c8 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -9,6 +9,7 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; +extern unsigned long sysctl_overcommit_kbytes; extern struct percpu_counter vm_committed_as; #ifdef CONFIG_SMP diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bd791e4..5f2052c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -490,6 +490,12 @@ struct zone { unsigned long managed_pages; /* + * Number of MIGRATE_RESEVE page block. To maintain for just + * optimization. Protected by zone->lock. + */ + int nr_migrate_reserve_block; + + /* * rarely used fields: */ const char *name; @@ -758,10 +764,7 @@ typedef struct pglist_data { int kswapd_max_order; enum zone_type classzone_idx; #ifdef CONFIG_NUMA_BALANCING - /* - * Lock serializing the per destination node AutoNUMA memory - * migration rate limiting data. - */ + /* Lock serializing the migrate rate limiting window */ spinlock_t numabalancing_migrate_lock; /* Rate limiting time interval */ diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 7931efe..fb61694 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -94,78 +94,12 @@ extern int posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); -#ifdef CONFIG_FS_POSIX_ACL -static inline struct posix_acl **acl_by_type(struct inode *inode, int type) -{ - switch (type) { - case ACL_TYPE_ACCESS: - return &inode->i_acl; - case ACL_TYPE_DEFAULT: - return &inode->i_default_acl; - default: - BUG(); - } -} - -static inline struct posix_acl *get_cached_acl(struct inode *inode, int type) -{ - struct posix_acl **p = acl_by_type(inode, type); - struct posix_acl *acl = ACCESS_ONCE(*p); - if (acl) { - spin_lock(&inode->i_lock); - acl = *p; - if (acl != ACL_NOT_CACHED) - acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); - } - return acl; -} - -static inline struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) -{ - return rcu_dereference(*acl_by_type(inode, type)); -} - -static inline void set_cached_acl(struct inode *inode, - int type, - struct posix_acl *acl) -{ - struct posix_acl **p = acl_by_type(inode, type); - struct posix_acl *old; - spin_lock(&inode->i_lock); - old = *p; - rcu_assign_pointer(*p, posix_acl_dup(acl)); - spin_unlock(&inode->i_lock); - if (old != ACL_NOT_CACHED) - posix_acl_release(old); -} - -static inline void forget_cached_acl(struct inode *inode, int type) -{ - struct posix_acl **p = acl_by_type(inode, type); - struct posix_acl *old; - spin_lock(&inode->i_lock); - old = *p; - *p = ACL_NOT_CACHED; - spin_unlock(&inode->i_lock); - if (old != ACL_NOT_CACHED) - posix_acl_release(old); -} - -static inline void forget_all_cached_acls(struct inode *inode) -{ - struct posix_acl *old_access, *old_default; - spin_lock(&inode->i_lock); - old_access = inode->i_acl; - old_default = inode->i_default_acl; - inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; - spin_unlock(&inode->i_lock); - if (old_access != ACL_NOT_CACHED) - posix_acl_release(old_access); - if (old_default != ACL_NOT_CACHED) - posix_acl_release(old_default); -} -#endif +struct posix_acl **acl_by_type(struct inode *inode, int type); +struct posix_acl *get_cached_acl(struct inode *inode, int type); +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); +void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); +void forget_cached_acl(struct inode *inode, int type); +void forget_all_cached_acls(struct inode *inode); static inline void cache_no_acl(struct inode *inode) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6dacb93..1da693d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -184,13 +184,13 @@ static inline void page_dup_rmap(struct page *page) int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); int page_referenced_one(struct page *, struct vm_area_struct *, - unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); + unsigned long address, void *arg); #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) int try_to_unmap(struct page *, enum ttu_flags flags); int try_to_unmap_one(struct page *, struct vm_area_struct *, - unsigned long address, enum ttu_flags flags); + unsigned long address, void *arg); /* * Called from mm/filemap_xip.c to unmap empty zero page @@ -236,10 +236,27 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* - * Called by migrate.c to remove migration ptes, but might be used more later. + * rmap_walk_control: To control rmap traversing for specific needs + * + * arg: passed to rmap_one() and invalid_vma() + * rmap_one: executed on each vma where page is mapped + * done: for checking traversing termination condition + * file_nonlinear: for handling file nonlinear mapping + * anon_lock: for getting anon_lock by optimized way rather than default + * invalid_vma: for skipping uninterested vma */ -int rmap_walk(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg); +struct rmap_walk_control { + void *arg; + int (*rmap_one)(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg); + int (*done)(struct page *page); + int (*file_nonlinear)(struct page *, struct address_space *, + struct vm_area_struct *vma); + struct anon_vma *(*anon_lock)(struct page *page); + bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); +}; + +int rmap_walk(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ diff --git a/include/linux/sched.h b/include/linux/sched.h index ffccdad..485234d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,6 +549,7 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; + struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ @@ -1271,6 +1272,7 @@ struct task_struct { /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; + struct list_head thread_node; struct completion *vfork_done; /* for vfork() */ int __user *set_child_tid; /* CLONE_CHILD_SETTID */ @@ -2341,6 +2343,16 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define __for_each_thread(signal, t) \ + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + +#define for_each_thread(p, t) \ + __for_each_thread((p)->signal, t) + +/* Careful: this is a double loop, 'break' won't work as expected. */ +#define for_each_process_thread(p, t) \ + for_each_process(p) for_each_thread(p, t) + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index fde1b3e..06f544e 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -67,6 +67,48 @@ TRACE_EVENT(mm_compaction_migratepages, __entry->nr_failed) ); +TRACE_EVENT(mm_compaction_begin, + TP_PROTO(unsigned long zone_start, unsigned long migrate_start, + unsigned long free_start, unsigned long zone_end), + + TP_ARGS(zone_start, migrate_start, free_start, zone_end), + + TP_STRUCT__entry( + __field(unsigned long, zone_start) + __field(unsigned long, migrate_start) + __field(unsigned long, free_start) + __field(unsigned long, zone_end) + ), + + TP_fast_assign( + __entry->zone_start = zone_start; + __entry->migrate_start = migrate_start; + __entry->free_start = free_start; + __entry->zone_end = zone_end; + ), + + TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu", + __entry->zone_start, + __entry->migrate_start, + __entry->free_start, + __entry->zone_end) +); + +TRACE_EVENT(mm_compaction_end, + TP_PROTO(int status), + + TP_ARGS(status), + + TP_STRUCT__entry( + __field(int, status) + ), + + TP_fast_assign( + __entry->status = status; + ), + + TP_printk("status=%d", __entry->status) +); #endif /* _TRACE_COMPACTION_H */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index ec2a6cc..3075ffb 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -45,6 +45,32 @@ TRACE_EVENT(mm_migrate_pages, __print_symbolic(__entry->reason, MIGRATE_REASON)) ); +TRACE_EVENT(mm_numa_migrate_ratelimit, + + TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages), + + TP_ARGS(p, dst_nid, nr_pages), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN) + __field( pid_t, pid) + __field( int, dst_nid) + __field( unsigned long, nr_pages) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->dst_nid = dst_nid; + __entry->nr_pages = nr_pages; + ), + + TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu", + __entry->comm, + __entry->pid, + __entry->dst_nid, + __entry->nr_pages) +); #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 04c3084..67e1bbf 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -443,6 +443,93 @@ TRACE_EVENT(sched_process_hang, ); #endif /* CONFIG_DETECT_HUNG_TASK */ +DECLARE_EVENT_CLASS(sched_move_task_template, + + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), + + TP_ARGS(tsk, src_cpu, dst_cpu), + + TP_STRUCT__entry( + __field( pid_t, pid ) + __field( pid_t, tgid ) + __field( pid_t, ngid ) + __field( int, src_cpu ) + __field( int, src_nid ) + __field( int, dst_cpu ) + __field( int, dst_nid ) + ), + + TP_fast_assign( + __entry->pid = task_pid_nr(tsk); + __entry->tgid = task_tgid_nr(tsk); + __entry->ngid = task_numa_group_id(tsk); + __entry->src_cpu = src_cpu; + __entry->src_nid = cpu_to_node(src_cpu); + __entry->dst_cpu = dst_cpu; + __entry->dst_nid = cpu_to_node(dst_cpu); + ), + + TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d", + __entry->pid, __entry->tgid, __entry->ngid, + __entry->src_cpu, __entry->src_nid, + __entry->dst_cpu, __entry->dst_nid) +); + +/* + * Tracks migration of tasks from one runqueue to another. Can be used to + * detect if automatic NUMA balancing is bouncing between nodes + */ +DEFINE_EVENT(sched_move_task_template, sched_move_numa, + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), + + TP_ARGS(tsk, src_cpu, dst_cpu) +); + +DEFINE_EVENT(sched_move_task_template, sched_stick_numa, + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), + + TP_ARGS(tsk, src_cpu, dst_cpu) +); + +TRACE_EVENT(sched_swap_numa, + + TP_PROTO(struct task_struct *src_tsk, int src_cpu, + struct task_struct *dst_tsk, int dst_cpu), + + TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu), + + TP_STRUCT__entry( + __field( pid_t, src_pid ) + __field( pid_t, src_tgid ) + __field( pid_t, src_ngid ) + __field( int, src_cpu ) + __field( int, src_nid ) + __field( pid_t, dst_pid ) + __field( pid_t, dst_tgid ) + __field( pid_t, dst_ngid ) + __field( int, dst_cpu ) + __field( int, dst_nid ) + ), + + TP_fast_assign( + __entry->src_pid = task_pid_nr(src_tsk); + __entry->src_tgid = task_tgid_nr(src_tsk); + __entry->src_ngid = task_numa_group_id(src_tsk); + __entry->src_cpu = src_cpu; + __entry->src_nid = cpu_to_node(src_cpu); + __entry->dst_pid = task_pid_nr(dst_tsk); + __entry->dst_tgid = task_tgid_nr(dst_tsk); + __entry->dst_ngid = task_numa_group_id(dst_tsk); + __entry->dst_cpu = dst_cpu; + __entry->dst_nid = cpu_to_node(dst_cpu); + ), + + TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", + __entry->src_pid, __entry->src_tgid, __entry->src_ngid, + __entry->src_cpu, __entry->src_nid, + __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, + __entry->dst_cpu, __entry->dst_nid) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/init/main.c b/init/main.c index febc511..f865261 100644 --- a/init/main.c +++ b/init/main.c @@ -355,9 +355,11 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { } */ static void __init setup_command_line(char *command_line) { - saved_command_line = alloc_bootmem(strlen (boot_command_line)+1); - initcall_command_line = alloc_bootmem(strlen (boot_command_line)+1); - static_command_line = alloc_bootmem(strlen (command_line)+1); + saved_command_line = + memblock_virt_alloc(strlen(boot_command_line) + 1, 0); + initcall_command_line = + memblock_virt_alloc(strlen(boot_command_line) + 1, 0); + static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); strcpy (saved_command_line, boot_command_line); strcpy (static_command_line, command_line); } @@ -476,7 +478,7 @@ static void __init mm_init(void) mem_init(); kmem_cache_init(); percpu_init_late(); - pgtable_cache_init(); + pgtable_init(); vmalloc_init(); } diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 43c307d..67ccf0e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -912,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk) } static int audit_tree_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmonut_mark, - struct fsnotify_event *event) + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) { - BUG(); - return -EOPNOTSUPP; + return 0; } static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) @@ -933,19 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify BUG_ON(atomic_read(&entry->refcnt) < 1); } -static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return false; -} - static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, - .should_send_event = audit_tree_send_event, - .free_group_priv = NULL, - .free_event_priv = NULL, .freeing_mark = audit_tree_freeing_mark, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 22831c4..2596fac 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -465,35 +465,27 @@ void audit_remove_watch_rule(struct audit_krule *krule) } } -static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return true; -} - /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *dname) { struct inode *inode; - __u32 mask = event->mask; - const char *dname = event->file_name; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); BUG_ON(group != audit_watch_group); - switch (event->data_type) { + switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = event->path.dentry->d_inode; + inode = ((struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = event->inode; + inode = (struct inode *)data; break; default: BUG(); @@ -512,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } static const struct fsnotify_ops audit_watch_fsnotify_ops = { - .should_send_event = audit_watch_should_send_event, .handle_event = audit_watch_handle_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event_priv = NULL, }; static int __init audit_watch_init(void) diff --git a/kernel/exit.c b/kernel/exit.c index a949819..1e77fc6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_group); + list_del_rcu(&p->thread_node); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 294189f..2f11bbe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1035,6 +1035,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); + + /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ + sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); + tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); + init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); @@ -1474,6 +1479,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, atomic_inc(¤t->signal->sigcnt); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); + list_add_tail_rcu(&p->thread_node, + &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b38109e..d9f61a1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem(sizeof(struct nosave_region)); + region = memblock_virt_alloc(sizeof(struct nosave_region), 0); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index be7c86b..f8b41bd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -757,14 +757,10 @@ void __init setup_log_buf(int early) return; if (early) { - unsigned long mem; - - mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); - if (!mem) - return; - new_log_buf = __va(mem); + new_log_buf = + memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); } else { - new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); } if (unlikely(!new_log_buf)) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3897e09..4d6964e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) goto out; + trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); out: @@ -4603,6 +4604,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) /* TODO: This is not properly updating schedstats */ + trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b24b6cf..867b0a4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1250,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p) p->numa_scan_period = task_scan_min(p); if (env.best_task == NULL) { - int ret = migrate_task_to(p, env.best_cpu); + ret = migrate_task_to(p, env.best_cpu); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); put_task_struct(env.best_task); return ret; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8da99f..332cefc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,8 +95,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; extern int max_threads; extern int suid_dumpable; #ifdef CONFIG_COREDUMP @@ -1121,7 +1119,14 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, }, { .procname = "page-cluster", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6982094..900b63c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1584,8 +1584,16 @@ config DMA_API_DEBUG With this option you will be able to detect common bugs in device drivers like double-freeing of DMA mappings or freeing mappings that were never allocated. - This option causes a performance degredation. Use only if you want - to debug device drivers. If unsure, say N. + + This also attempts to catch cases where a page owned by DMA is + accessed by the cpu in a way that could cause data corruption. For + example, this enables cow_user_page() to check that the source page is + not undergoing DMA. + + This option causes a performance degradation. Use only if you want to + debug device drivers and dma interactions. + + If unsure, say N. source "samples/Kconfig" diff --git a/lib/cpumask.c b/lib/cpumask.c index d327b87..b810b75 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -140,7 +140,7 @@ EXPORT_SYMBOL(zalloc_cpumask_var); */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { - *mask = alloc_bootmem(cpumask_size()); + *mask = memblock_virt_alloc(cpumask_size(), 0); } /** @@ -161,6 +161,6 @@ EXPORT_SYMBOL(free_cpumask_var); */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { - free_bootmem(__pa(mask), cpumask_size()); + memblock_free_early(__pa(mask), cpumask_size()); } #endif diff --git a/lib/dma-debug.c b/lib/dma-debug.c index d87a17a..c380838 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -53,11 +53,26 @@ enum map_err_types { #define DMA_DEBUG_STACKTRACE_ENTRIES 5 +/** + * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping + * @list: node on pre-allocated free_entries list + * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent + * @type: single, page, sg, coherent + * @pfn: page frame of the start address + * @offset: offset of mapping relative to pfn + * @size: length of the mapping + * @direction: enum dma_data_direction + * @sg_call_ents: 'nents' from dma_map_sg + * @sg_mapped_ents: 'mapped_ents' from dma_map_sg + * @map_err_type: track whether dma_mapping_error() was checked + * @stacktrace: support backtraces when a violation is detected + */ struct dma_debug_entry { struct list_head list; struct device *dev; int type; - phys_addr_t paddr; + unsigned long pfn; + size_t offset; u64 dev_addr; u64 size; int direction; @@ -372,6 +387,11 @@ static void hash_bucket_del(struct dma_debug_entry *entry) list_del(&entry->list); } +static unsigned long long phys_addr(struct dma_debug_entry *entry) +{ + return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; +} + /* * Dump mapping entries for debugging purposes */ @@ -389,9 +409,9 @@ void debug_dma_dump_mappings(struct device *dev) list_for_each_entry(entry, &bucket->list, list) { if (!dev || dev == entry->dev) { dev_info(entry->dev, - "%s idx %d P=%Lx D=%Lx L=%Lx %s %s\n", + "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n", type2name[entry->type], idx, - (unsigned long long)entry->paddr, + phys_addr(entry), entry->pfn, entry->dev_addr, entry->size, dir2name[entry->direction], maperr2str[entry->map_err_type]); @@ -404,6 +424,133 @@ void debug_dma_dump_mappings(struct device *dev) EXPORT_SYMBOL(debug_dma_dump_mappings); /* + * For each page mapped (initial page in the case of + * dma_alloc_coherent/dma_map_{single|page}, or each page in a + * scatterlist) insert into this tree using the pfn as the key. At + * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If + * the pfn already exists at insertion time add a tag as a reference + * count for the overlapping mappings. For now, the overlap tracking + * just ensures that 'unmaps' balance 'maps' before marking the pfn + * idle, but we should also be flagging overlaps as an API violation. + * + * Memory usage is mostly constrained by the maximum number of available + * dma-debug entries in that we need a free dma_debug_entry before + * inserting into the tree. In the case of dma_map_{single|page} and + * dma_alloc_coherent there is only one dma_debug_entry and one pfn to + * track per event. dma_map_sg(), on the other hand, + * consumes a single dma_debug_entry, but inserts 'nents' entries into + * the tree. + * + * At any time debug_dma_assert_idle() can be called to trigger a + * warning if the given page is in the active set. + */ +static RADIX_TREE(dma_active_pfn, GFP_NOWAIT); +static DEFINE_SPINLOCK(radix_lock); +#define ACTIVE_PFN_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) + +static int active_pfn_read_overlap(unsigned long pfn) +{ + int overlap = 0, i; + + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) + if (radix_tree_tag_get(&dma_active_pfn, pfn, i)) + overlap |= 1 << i; + return overlap; +} + +static int active_pfn_set_overlap(unsigned long pfn, int overlap) +{ + int i; + + if (overlap > ACTIVE_PFN_MAX_OVERLAP || overlap < 0) + return 0; + + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) + if (overlap & 1 << i) + radix_tree_tag_set(&dma_active_pfn, pfn, i); + else + radix_tree_tag_clear(&dma_active_pfn, pfn, i); + + return overlap; +} + +static void active_pfn_inc_overlap(unsigned long pfn) +{ + int overlap = active_pfn_read_overlap(pfn); + + overlap = active_pfn_set_overlap(pfn, ++overlap); + + /* If we overflowed the overlap counter then we're potentially + * leaking dma-mappings. Otherwise, if maps and unmaps are + * balanced then this overflow may cause false negatives in + * debug_dma_assert_idle() as the pfn may be marked idle + * prematurely. + */ + WARN_ONCE(overlap == 0, + "DMA-API: exceeded %d overlapping mappings of pfn %lx\n", + ACTIVE_PFN_MAX_OVERLAP, pfn); +} + +static int active_pfn_dec_overlap(unsigned long pfn) +{ + int overlap = active_pfn_read_overlap(pfn); + + return active_pfn_set_overlap(pfn, --overlap); +} + +static int active_pfn_insert(struct dma_debug_entry *entry) +{ + unsigned long flags; + int rc; + + spin_lock_irqsave(&radix_lock, flags); + rc = radix_tree_insert(&dma_active_pfn, entry->pfn, entry); + if (rc == -EEXIST) + active_pfn_inc_overlap(entry->pfn); + spin_unlock_irqrestore(&radix_lock, flags); + + return rc; +} + +static void active_pfn_remove(struct dma_debug_entry *entry) +{ + unsigned long flags; + + spin_lock_irqsave(&radix_lock, flags); + if (active_pfn_dec_overlap(entry->pfn) == 0) + radix_tree_delete(&dma_active_pfn, entry->pfn); + spin_unlock_irqrestore(&radix_lock, flags); +} + +/** + * debug_dma_assert_idle() - assert that a page is not undergoing dma + * @page: page to lookup in the dma_active_pfn tree + * + * Place a call to this routine in cases where the cpu touching the page + * before the dma completes (page is dma_unmapped) will lead to data + * corruption. + */ +void debug_dma_assert_idle(struct page *page) +{ + unsigned long flags; + struct dma_debug_entry *entry; + + if (!page) + return; + + spin_lock_irqsave(&radix_lock, flags); + entry = radix_tree_lookup(&dma_active_pfn, page_to_pfn(page)); + spin_unlock_irqrestore(&radix_lock, flags); + + if (!entry) + return; + + err_printk(entry->dev, entry, + "DMA-API: cpu touching an active dma mapped page " + "[pfn=0x%lx]\n", entry->pfn); +} + +/* * Wrapper function for adding an entry to the hash. * This function takes care of locking itself. */ @@ -411,10 +558,21 @@ static void add_dma_entry(struct dma_debug_entry *entry) { struct hash_bucket *bucket; unsigned long flags; + int rc; bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); put_hash_bucket(bucket, &flags); + + rc = active_pfn_insert(entry); + if (rc == -ENOMEM) { + pr_err("DMA-API: pfn tracking ENOMEM, dma-debug disabled\n"); + global_disable = true; + } + + /* TODO: report -EEXIST errors here as overlapping mappings are + * not supported by the DMA API + */ } static struct dma_debug_entry *__dma_entry_alloc(void) @@ -469,6 +627,8 @@ static void dma_entry_free(struct dma_debug_entry *entry) { unsigned long flags; + active_pfn_remove(entry); + /* * add to beginning of the list - this way the entries are * more likely cache hot when they are reallocated. @@ -895,15 +1055,15 @@ static void check_unmap(struct dma_debug_entry *ref) ref->dev_addr, ref->size, type2name[entry->type], type2name[ref->type]); } else if ((entry->type == dma_debug_coherent) && - (ref->paddr != entry->paddr)) { + (phys_addr(ref) != phys_addr(entry))) { err_printk(ref->dev, entry, "DMA-API: device driver frees " "DMA memory with different CPU address " "[device address=0x%016llx] [size=%llu bytes] " "[cpu alloc address=0x%016llx] " "[cpu free address=0x%016llx]", ref->dev_addr, ref->size, - (unsigned long long)entry->paddr, - (unsigned long long)ref->paddr); + phys_addr(entry), + phys_addr(ref)); } if (ref->sg_call_ents && ref->type == dma_debug_sg && @@ -1052,7 +1212,8 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, entry->dev = dev; entry->type = dma_debug_page; - entry->paddr = page_to_phys(page) + offset; + entry->pfn = page_to_pfn(page); + entry->offset = offset, entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; @@ -1148,7 +1309,8 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->type = dma_debug_sg; entry->dev = dev; - entry->paddr = sg_phys(s); + entry->pfn = page_to_pfn(sg_page(s)); + entry->offset = s->offset, entry->size = sg_dma_len(s); entry->dev_addr = sg_dma_address(s); entry->direction = direction; @@ -1198,7 +1360,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, struct dma_debug_entry ref = { .type = dma_debug_sg, .dev = dev, - .paddr = sg_phys(s), + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, .dev_addr = sg_dma_address(s), .size = sg_dma_len(s), .direction = dir, @@ -1233,7 +1396,8 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, entry->type = dma_debug_coherent; entry->dev = dev; - entry->paddr = virt_to_phys(virt); + entry->pfn = page_to_pfn(virt_to_page(virt)); + entry->offset = (size_t) virt & PAGE_MASK; entry->size = size; entry->dev_addr = dma_addr; entry->direction = DMA_BIDIRECTIONAL; @@ -1248,7 +1412,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size, struct dma_debug_entry ref = { .type = dma_debug_coherent, .dev = dev, - .paddr = virt_to_phys(virt), + .pfn = page_to_pfn(virt_to_page(virt)), + .offset = (size_t) virt & PAGE_MASK, .dev_addr = addr, .size = size, .direction = DMA_BIDIRECTIONAL, @@ -1356,7 +1521,8 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, struct dma_debug_entry ref = { .type = dma_debug_sg, .dev = dev, - .paddr = sg_phys(s), + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, .dev_addr = sg_dma_address(s), .size = sg_dma_len(s), .direction = direction, @@ -1388,7 +1554,8 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, struct dma_debug_entry ref = { .type = dma_debug_sg, .dev = dev, - .paddr = sg_phys(s), + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, .dev_addr = sg_dma_address(s), .size = sg_dma_len(s), .direction = direction, diff --git a/lib/show_mem.c b/lib/show_mem.c index 5847a49..0922579 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -17,9 +17,6 @@ void show_mem(unsigned int filter) printk("Mem-Info:\n"); show_free_areas(filter); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; - for_each_online_pgdat(pgdat) { unsigned long flags; int zoneid; @@ -46,4 +43,7 @@ void show_mem(unsigned int filter) printk("%lu pages in pagetable cache\n", quicklist_total_size()); #endif +#ifdef CONFIG_MEMORY_FAILURE + printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); +#endif } diff --git a/lib/swiotlb.c b/lib/swiotlb.c index e4399fa..615f3de 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -172,8 +172,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) /* * Get the overflow emergency buffer */ - v_overflow_buffer = alloc_bootmem_low_pages_nopanic( - PAGE_ALIGN(io_tlb_overflow)); + v_overflow_buffer = memblock_virt_alloc_nopanic( + PAGE_ALIGN(io_tlb_overflow), + PAGE_SIZE); if (!v_overflow_buffer) return -ENOMEM; @@ -184,11 +185,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. */ - io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); + io_tlb_list = memblock_virt_alloc( + PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), + PAGE_SIZE); for (i = 0; i < io_tlb_nslabs; i++) io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); io_tlb_index = 0; - io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + io_tlb_orig_addr = memblock_virt_alloc( + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), + PAGE_SIZE); if (verbose) swiotlb_print_info(); @@ -215,13 +220,13 @@ swiotlb_init(int verbose) bytes = io_tlb_nslabs << IO_TLB_SHIFT; /* Get IO TLB memory from the low pages */ - vstart = alloc_bootmem_low_pages_nopanic(PAGE_ALIGN(bytes)); + vstart = memblock_virt_alloc_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; if (io_tlb_start) - free_bootmem(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + memblock_free_early(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); pr_warn("Cannot allocate SWIOTLB buffer"); no_iotlb_memory = true; } @@ -357,14 +362,14 @@ void __init swiotlb_free(void) free_pages((unsigned long)phys_to_virt(io_tlb_start), get_order(io_tlb_nslabs << IO_TLB_SHIFT)); } else { - free_bootmem_late(io_tlb_overflow_buffer, - PAGE_ALIGN(io_tlb_overflow)); - free_bootmem_late(__pa(io_tlb_orig_addr), - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); - free_bootmem_late(__pa(io_tlb_list), - PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); - free_bootmem_late(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + memblock_free_late(io_tlb_overflow_buffer, + PAGE_ALIGN(io_tlb_overflow)); + memblock_free_late(__pa(io_tlb_orig_addr), + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + memblock_free_late(__pa(io_tlb_list), + PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); + memblock_free_late(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } io_tlb_nslabs = 0; } diff --git a/mm/compaction.c b/mm/compaction.c index f58bcd0..3a91a2e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; + bool skipped_async_unsuitable = false; /* * Ensure that there are not too many pages isolated from the LRU @@ -534,6 +535,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { cc->finished_update_migrate = true; + skipped_async_unsuitable = true; goto next_pageblock; } @@ -627,8 +629,13 @@ next_pageblock: if (locked) spin_unlock_irqrestore(&zone->lru_lock, flags); - /* Update the pageblock-skip if the whole pageblock was scanned */ - if (low_pfn == end_pfn) + /* + * Update the pageblock-skip information and cached scanner pfn, + * if the whole pageblock was scanned without isolating any page. + * This is not done when pageblock was skipped due to being unsuitable + * for async compaction, so that eventual sync compaction can try. + */ + if (low_pfn == end_pfn && !skipped_async_unsuitable) update_pageblock_skip(cc, valid_page, nr_isolated, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -660,7 +667,7 @@ static void isolate_freepages(struct zone *zone, * is the end of the pageblock the migration scanner is using. */ pfn = cc->free_pfn; - low_pfn = cc->migrate_pfn + pageblock_nr_pages; + low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* * Take care that if the migration scanner is at the end of the zone @@ -676,7 +683,7 @@ static void isolate_freepages(struct zone *zone, * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; + for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; @@ -738,7 +745,14 @@ static void isolate_freepages(struct zone *zone, /* split_free_page does not map the pages */ map_pages(freelist); - cc->free_pfn = high_pfn; + /* + * If we crossed the migrate scanner, we want to keep it that way + * so that compact_finished() may detect this + */ + if (pfn < low_pfn) + cc->free_pfn = max(pfn, zone->zone_start_pfn); + else + cc->free_pfn = high_pfn; cc->nr_freepages = nr_freepages; } @@ -837,6 +851,10 @@ static int compact_finished(struct zone *zone, /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { + /* Let the next compaction start anew. */ + zone->compact_cached_migrate_pfn = zone->zone_start_pfn; + zone->compact_cached_free_pfn = zone_end_pfn(zone); + /* * Mark that the PG_migrate_skip information should be cleared * by kswapd when it goes to sleep. kswapd does not set the @@ -947,6 +965,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } /* + * Clear pageblock skip if there were failures recently and compaction + * is about to be retried after being deferred. kswapd does not do + * this reset as it'll reset the cached information when going to sleep. + */ + if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + __reset_isolation_suitable(zone); + + /* * Setup to move all movable pages to the end of the zone. Used cached * information on where the scanners should start but check that it * is initialised by ensuring the values are within zone boundaries. @@ -962,13 +988,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn = cc->migrate_pfn; } - /* - * Clear pageblock skip if there were failures recently and compaction - * is about to be retried after being deferred. kswapd does not do - * this reset as it'll reset the cached information when going to sleep. - */ - if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) - __reset_isolation_suitable(zone); + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); migrate_prep_local(); @@ -1003,7 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) if (err) { putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; - if (err == -ENOMEM) { + /* + * migrate_pages() may return -ENOMEM when scanners meet + * and we want compact_finished() to detect it + */ + if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { ret = COMPACT_PARTIAL; goto out; } @@ -1015,6 +1039,8 @@ out: cc->nr_freepages -= release_freepages(&cc->freepages); VM_BUG_ON(cc->nr_freepages != 0); + trace_mm_compaction_end(ret); + return ret; } @@ -1120,12 +1146,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) compact_zone(zone, cc); if (cc->order > 0) { - int ok = zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0); - if (ok && cc->order >= zone->compact_order_failed) - zone->compact_order_failed = cc->order + 1; + if (zone_watermark_ok(zone, cc->order, + low_wmark_pages(zone), 0, 0)) + compaction_defer_reset(zone, cc->order, false); /* Currently async compaction is never deferred. */ - else if (!ok && cc->sync) + else if (cc->sync) defer_compaction(zone, cc->order); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dee6cf4..04306b9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) */ int PageHuge(struct page *page) { - compound_page_dtor *dtor; - if (!PageCompound(page)) return 0; page = compound_head(page); - dtor = get_compound_page_dtor(page); - - return dtor == free_huge_page; + return get_compound_page_dtor(page) == free_huge_page; } EXPORT_SYMBOL_GPL(PageHuge); @@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge); */ int PageHeadHuge(struct page *page_head) { - compound_page_dtor *dtor; - if (!PageHead(page_head)) return 0; - dtor = get_compound_page_dtor(page_head); - - return dtor == free_huge_page; + return get_compound_page_dtor(page_head) == free_huge_page; } -EXPORT_SYMBOL_GPL(PageHeadHuge); pgoff_t __basepage_index(struct page *page) { @@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), - huge_page_size(h), huge_page_size(h), 0); - + addr = memblock_virt_alloc_try_nid_nopanic( + huge_page_size(h), huge_page_size(h), + 0, BOOTMEM_ALLOC_ACCESSIBLE, node); if (addr) { /* * Use the beginning of the huge page to store the @@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void) #ifdef CONFIG_HIGHMEM page = pfn_to_page(m->phys >> PAGE_SHIFT); - free_bootmem_late((unsigned long)m, - sizeof(struct huge_bootmem_page)); + memblock_free_late(__pa(m), + sizeof(struct huge_bootmem_page)); #else page = virt_to_page(m); #endif @@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, int cow; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + int ret = 0; cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + mmun_start = vma->vm_start; + mmun_end = vma->vm_end; + if (cow) + mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); + for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; dst_pte = huge_pte_alloc(dst, addr, sz); - if (!dst_pte) - goto nomem; + if (!dst_pte) { + ret = -ENOMEM; + break; + } /* If the pagetables are shared don't copy or take references */ if (dst_pte == src_pte) @@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_unlock(src_ptl); spin_unlock(dst_ptl); } - return 0; -nomem: - return -ENOMEM; + if (cow) + mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); + + return ret; } static int is_hugetlb_entry_migration(pte_t pte) @@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); - get_page(pages[i]); + get_page_foll(pages[i]); } if (vmas) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 4c84678..95487c7 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val) return 0; inject: - printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); + pr_info("Injecting memory failure at pfn %#lx\n", pfn); return memory_failure(pfn, 18, MF_COUNT_INCREASED); } diff --git a/mm/internal.h b/mm/internal.h index 684f7aa..a346ba1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -47,11 +47,9 @@ static inline void __get_page_tail_foll(struct page *page, * page_cache_get_speculative()) on tail pages. */ VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - VM_BUG_ON(page_mapcount(page) < 0); if (get_page_head) atomic_inc(&page->first_page->_count); - atomic_inc(&page->_mapcount); + get_huge_page_tail(page); } /* @@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, - unsigned long *vm_flags) +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; - unsigned int mapcount = page_mapcount(page); - int referenced = 0; + int ret = SWAP_AGAIN; int search_new_forks = 0; VM_BUG_ON(!PageKsm(page)); + + /* + * Rely on the page lock to protect against concurrent modifications + * to that page's node of the stable tree. + */ VM_BUG_ON(!PageLocked(page)); stable_node = page_stable_node(page); if (!stable_node) - return 0; + return ret; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; @@ -1928,113 +1931,16 @@ again: if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - - referenced += page_referenced_one(page, vma, - rmap_item->address, &mapcount, vm_flags); - if (!search_new_forks || !mapcount) - break; - } - anon_vma_unlock_read(anon_vma); - if (!mapcount) - goto out; - } - if (!search_new_forks++) - goto again; -out: - return referenced; -} - -int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) -{ - struct stable_node *stable_node; - struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; - int search_new_forks = 0; - - VM_BUG_ON(!PageKsm(page)); - VM_BUG_ON(!PageLocked(page)); - - stable_node = page_stable_node(page); - if (!stable_node) - return SWAP_FAIL; -again: - hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { - struct anon_vma *anon_vma = rmap_item->anon_vma; - struct anon_vma_chain *vmac; - struct vm_area_struct *vma; - - anon_vma_lock_read(anon_vma); - anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { - vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) - continue; - /* - * Initially we examine only the vma which covers this - * rmap_item; but later, if there is still work to do, - * we examine covering vmas in other mms: in case they - * were forked from the original since ksmd passed. - */ - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = try_to_unmap_one(page, vma, - rmap_item->address, flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) { + ret = rwc->rmap_one(page, vma, + rmap_item->address, rwc->arg); + if (ret != SWAP_AGAIN) { anon_vma_unlock_read(anon_vma); goto out; } - } - anon_vma_unlock_read(anon_vma); - } - if (!search_new_forks++) - goto again; -out: - return ret; -} - -#ifdef CONFIG_MIGRATION -int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) -{ - struct stable_node *stable_node; - struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; - int search_new_forks = 0; - - VM_BUG_ON(!PageKsm(page)); - VM_BUG_ON(!PageLocked(page)); - - stable_node = page_stable_node(page); - if (!stable_node) - return ret; -again: - hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { - struct anon_vma *anon_vma = rmap_item->anon_vma; - struct anon_vma_chain *vmac; - struct vm_area_struct *vma; - - anon_vma_lock_read(anon_vma); - anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { - vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) - continue; - /* - * Initially we examine only the vma which covers this - * rmap_item; but later, if there is still work to do, - * we examine covering vmas in other mms: in case they - * were forked from the original since ksmd passed. - */ - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) - continue; - - ret = rmap_one(page, vma, rmap_item->address, arg); - if (ret != SWAP_AGAIN) { + if (rwc->done && rwc->done(page)) { anon_vma_unlock_read(anon_vma); goto out; } @@ -2047,6 +1953,7 @@ out: return ret; } +#ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { struct stable_node *stable_node; diff --git a/mm/memblock.c b/mm/memblock.c index 53e477b..1c2ef2c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -21,6 +21,9 @@ #include <linux/memblock.h> #include <asm-generic/sections.h> +#include <linux/io.h> + +#include "internal.h" static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; @@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = { }; int memblock_debug __initdata_memblock; +#ifdef CONFIG_MOVABLE_NODE +bool movable_node_enabled __initdata_memblock = false; +#endif static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; @@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Utility called from memblock_find_in_range_node(), find free area bottom-up. * @@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Utility called from memblock_find_in_range_node(), find free area top-down. * @@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, /** * memblock_find_in_range_node - find free area in given range and node - * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Find @size free area aligned to @align in the specified range and node. * @@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * RETURNS: * Found address on success, 0 on failure. */ -phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, - phys_addr_t end, phys_addr_t size, - phys_addr_t align, int nid) +phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid) { int ret; phys_addr_t kernel_end; @@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { - return memblock_find_in_range_node(start, end, size, align, - MAX_NUMNODES); + return memblock_find_in_range_node(size, align, start, end, + NUMA_NO_NODE); } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -255,6 +261,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u type->cnt = 1; type->regions[0].base = 0; type->regions[0].size = 0; + type->regions[0].flags = 0; memblock_set_region_node(&type->regions[0], MAX_NUMNODES); } } @@ -265,6 +272,19 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( if (memblock.reserved.regions == memblock_reserved_init_regions) return 0; + /* + * Don't allow nobootmem allocator to free reserved memory regions + * array if + * - CONFIG_DEBUG_FS is enabled; + * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled; + * - reserved memory regions array have been resized during boot. + * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved" + * will show garbage instead of state of memory reservations. + */ + if (IS_ENABLED(CONFIG_DEBUG_FS) && + !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + return 0; + *addr = __pa(memblock.reserved.regions); return PAGE_ALIGN(sizeof(struct memblock_region) * @@ -405,7 +425,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) if (this->base + this->size != next->base || memblock_get_region_node(this) != - memblock_get_region_node(next)) { + memblock_get_region_node(next) || + this->flags != next->flags) { BUG_ON(this->base + this->size > next->base); i++; continue; @@ -425,13 +446,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) * @base: base address of the new region * @size: size of the new region * @nid: node id of the new region + * @flags: flags of the new region * * Insert new memblock region [@base,@base+@size) into @type at @idx. * @type must already have extra room to accomodate the new region. */ static void __init_memblock memblock_insert_region(struct memblock_type *type, int idx, phys_addr_t base, - phys_addr_t size, int nid) + phys_addr_t size, + int nid, unsigned long flags) { struct memblock_region *rgn = &type->regions[idx]; @@ -439,6 +462,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); rgn->base = base; rgn->size = size; + rgn->flags = flags; memblock_set_region_node(rgn, nid); type->cnt++; type->total_size += size; @@ -450,6 +474,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * @base: base address of the new region * @size: size of the new region * @nid: nid of the new region + * @flags: flags of the new region * * Add new memblock region [@base,@base+@size) into @type. The new region * is allowed to overlap with existing ones - overlaps don't affect already @@ -460,7 +485,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * 0 on success, -errno on failure. */ static int __init_memblock memblock_add_region(struct memblock_type *type, - phys_addr_t base, phys_addr_t size, int nid) + phys_addr_t base, phys_addr_t size, + int nid, unsigned long flags) { bool insert = false; phys_addr_t obase = base; @@ -475,6 +501,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type, WARN_ON(type->cnt != 1 || type->total_size); type->regions[0].base = base; type->regions[0].size = size; + type->regions[0].flags = flags; memblock_set_region_node(&type->regions[0], nid); type->total_size = size; return 0; @@ -505,7 +532,8 @@ repeat: nr_new++; if (insert) memblock_insert_region(type, i++, base, - rbase - base, nid); + rbase - base, nid, + flags); } /* area below @rend is dealt with, forget about it */ base = min(rend, end); @@ -515,7 +543,8 @@ repeat: if (base < end) { nr_new++; if (insert) - memblock_insert_region(type, i, base, end - base, nid); + memblock_insert_region(type, i, base, end - base, + nid, flags); } /* @@ -537,12 +566,13 @@ repeat: int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { - return memblock_add_region(&memblock.memory, base, size, nid); + return memblock_add_region(&memblock.memory, base, size, nid, 0); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); + return memblock_add_region(&memblock.memory, base, size, + MAX_NUMNODES, 0); } /** @@ -597,7 +627,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->size -= base - rbase; type->total_size -= base - rbase; memblock_insert_region(type, i, rbase, base - rbase, - memblock_get_region_node(rgn)); + memblock_get_region_node(rgn), + rgn->flags); } else if (rend > end) { /* * @rgn intersects from above. Split and redo the @@ -607,7 +638,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->size -= end - rbase; type->total_size -= end - rbase; memblock_insert_region(type, i--, rbase, end - rbase, - memblock_get_region_node(rgn)); + memblock_get_region_node(rgn), + rgn->flags); } else { /* @rgn is fully contained, record it */ if (!*end_rgn) @@ -643,28 +675,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", (unsigned long long)base, - (unsigned long long)base + size, + (unsigned long long)base + size - 1, (void *)_RET_IP_); return __memblock_remove(&memblock.reserved, base, size); } -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +static int __init_memblock memblock_reserve_region(phys_addr_t base, + phys_addr_t size, + int nid, + unsigned long flags) { struct memblock_type *_rgn = &memblock.reserved; - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", + memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, - (unsigned long long)base + size, - (void *)_RET_IP_); + (unsigned long long)base + size - 1, + flags, (void *)_RET_IP_); - return memblock_add_region(_rgn, base, size, MAX_NUMNODES); + return memblock_add_region(_rgn, base, size, nid, flags); +} + +int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +{ + return memblock_reserve_region(base, size, MAX_NUMNODES, 0); +} + +/** + * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. + * @base: the base phys addr of the region + * @size: the size of the region + * + * This function isolates region [@base, @base + @size), and mark it with flag + * MEMBLOCK_HOTPLUG. + * + * Return 0 on succees, -errno on failure. + */ +int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) +{ + struct memblock_type *type = &memblock.memory; + int i, ret, start_rgn, end_rgn; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); + + memblock_merge_regions(type); + return 0; +} + +/** + * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * This function isolates region [@base, @base + @size), and clear flag + * MEMBLOCK_HOTPLUG for the isolated regions. + * + * Return 0 on succees, -errno on failure. + */ +int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) +{ + struct memblock_type *type = &memblock.memory; + int i, ret, start_rgn, end_rgn; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_clear_region_flags(&type->regions[i], + MEMBLOCK_HOTPLUG); + + memblock_merge_regions(type); + return 0; } /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable - * @nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %NUMA_NO_NODE for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL @@ -693,13 +786,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, int mi = *idx & 0xffffffff; int ri = *idx >> 32; + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + for ( ; mi < mem->cnt; mi++) { struct memblock_region *m = &mem->regions[mi]; phys_addr_t m_start = m->base; phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) continue; /* scan areas before each reservation for intersection */ @@ -740,12 +836,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, /** * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @nid: nid: node selector, %NUMA_NO_NODE for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * * Reverse of __next_free_mem_range(). + * + * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't + * be able to hot-remove hotpluggable memory used by the kernel. So this + * function skip hotpluggable regions if needed when allocating memory for the + * kernel. */ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, @@ -756,6 +857,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, int mi = *idx & 0xffffffff; int ri = *idx >> 32; + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + if (*idx == (u64)ULLONG_MAX) { mi = mem->cnt - 1; ri = rsv->cnt; @@ -767,7 +871,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) + continue; + + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) continue; /* scan areas before each reservation for intersection */ @@ -837,18 +945,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for * @size: size of area to set node ID for + * @type: memblock type to set node ID for * @nid: node ID to set * - * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. + * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. * Regions which cross the area boundaries are split as necessary. * * RETURNS: * 0 on success, -errno on failure. */ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, - int nid) + struct memblock_type *type, int nid) { - struct memblock_type *type = &memblock.memory; int start_rgn, end_rgn; int i, ret; @@ -870,13 +978,13 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, { phys_addr_t found; - if (WARN_ON(!align)) - align = __alignof__(long long); + if (!align) + align = SMP_CACHE_BYTES; /* align @size to avoid excessive fragmentation on reserved array */ size = round_up(size, align); - found = memblock_find_in_range_node(0, max_addr, size, align, nid); + found = memblock_find_in_range_node(size, align, 0, max_addr, nid); if (found && !memblock_reserve(found, size)) return found; @@ -890,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); + return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); } phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) @@ -920,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } +/** + * memblock_virt_alloc_internal - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region to allocate (phys address) + * @max_addr: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * The @min_addr limit is dropped if it can not be satisfied and the allocation + * will fall back to memory below @min_addr. Also, allocation may fall back + * to any node in the system if the specified node can not + * hold the requested memory. + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. + * + * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. + * + * The phys address of allocated boot memory block is converted to virtual and + * allocated memory is reset to 0. + * + * In addition, function sets the min_count to 0 using kmemleak_alloc for + * allocated boot memory block, so that it is never reported as leaks. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +static void * __init memblock_virt_alloc_internal( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + phys_addr_t alloc; + void *ptr; + + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + + /* + * Detect any accidental use of these APIs after slab is ready, as at + * this moment memblock may be deinitialized already and its + * internal data may be destroyed (after execution of free_all_bootmem) + */ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, nid); + + if (!align) + align = SMP_CACHE_BYTES; + + /* align @size to avoid excessive fragmentation on reserved array */ + size = round_up(size, align); + +again: + alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, + nid); + if (alloc) + goto done; + + if (nid != NUMA_NO_NODE) { + alloc = memblock_find_in_range_node(size, align, min_addr, + max_addr, NUMA_NO_NODE); + if (alloc) + goto done; + } + + if (min_addr) { + min_addr = 0; + goto again; + } else { + goto error; + } + +done: + memblock_reserve(alloc, size); + ptr = phys_to_virt(alloc); + memset(ptr, 0, size); + + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. This is because many of these blocks + * are only referred via the physical address which is not + * looked up by kmemleak. + */ + kmemleak_alloc(ptr, size, 0, 0); + + return ptr; + +error: + return NULL; +} + +/** + * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides + * additional debug information (including caller info), if enabled. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid_nopanic( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + return memblock_virt_alloc_internal(size, align, min_addr, + max_addr, nid); +} + +/** + * memblock_virt_alloc_try_nid - allocate boot memory block with panicking + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() + * which provides debug information (including caller info), if enabled, + * and panics if the request can not be satisfied. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); + if (ptr) + return ptr; + + panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr); + return NULL; +} + +/** + * __memblock_free_early - free boot memory block + * @base: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. + * The freeing memory will not be released to the buddy allocator. + */ +void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) +{ + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + __memblock_remove(&memblock.reserved, base, size); +} + +/* + * __memblock_free_late - free bootmem block pages directly to buddy allocator + * @addr: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are released directly + * to the buddy allocator, no bootmem metadata is updated because it is gone. + */ +void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) +{ + u64 cursor, end; + + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + cursor = PFN_UP(base); + end = PFN_DOWN(base + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} /* * Remaining API functions @@ -1101,6 +1410,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { unsigned long long base, size; + unsigned long flags; int i; pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); @@ -1111,13 +1421,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name base = rgn->base; size = rgn->size; + flags = rgn->flags; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif - pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", - name, i, base, base + size - 1, size, nid_buf); + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", + name, i, base, base + size - 1, size, nid_buf, flags); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7caff36..67dd2a8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1688,13 +1688,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - struct cgroup *task_cgrp; - struct cgroup *mem_cgrp; /* - * Need a buffer in BSS, can't rely on allocations. The code relies - * on the assumption that OOM is serialized for memory controller. - * If this assumption is broken, revisit this code. + * protects memcg_name and makes sure that parallel ooms do not + * interleave */ + static DEFINE_SPINLOCK(oom_info_lock); + struct cgroup *task_cgrp; + struct cgroup *mem_cgrp; static char memcg_name[PATH_MAX]; int ret; struct mem_cgroup *iter; @@ -1703,6 +1703,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) if (!p) return; + spin_lock(&oom_info_lock); rcu_read_lock(); mem_cgrp = memcg->css.cgroup; @@ -1771,6 +1772,7 @@ done: pr_cont("\n"); } + spin_unlock(&oom_info_lock); } /* @@ -3000,7 +3002,8 @@ static DEFINE_MUTEX(set_limit_mutex); static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) { return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && - (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); + (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) == + KMEM_ACCOUNTED_MASK; } /* @@ -3126,7 +3129,7 @@ int memcg_cache_id(struct mem_cgroup *memcg) * But when we create a new cache, we can call this as well if its parent * is kmem-limited. That will have to hold set_limit_mutex as well. */ -int memcg_update_cache_sizes(struct mem_cgroup *memcg) +static int memcg_update_cache_sizes(struct mem_cgroup *memcg) { int num, ret; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fabe550..b25ed32 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) } /* - * Dirty cache page page + * Dirty pagecache page * Issues: when the error hit a hole page the error is not properly * propagated. */ @@ -1585,7 +1585,13 @@ static int __soft_offline_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - putback_lru_pages(&pagelist); + if (!list_empty(&pagelist)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/memory.c b/mm/memory.c index 6768ce9..86487df 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -59,6 +59,7 @@ #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> +#include <linux/dma-debug.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { + debug_dma_assert_idle(src); + /* * If the source page was a PFN mapping, we don't have * a "struct page" for it. We do a best-effort copy by @@ -4272,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS + +static struct kmem_cache *page_ptl_cachep; + +void __init ptlock_cache_init(void) +{ + page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, + SLAB_PANIC, NULL); +} + bool ptlock_alloc(struct page *page) { spinlock_t *ptl; - ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; page->ptl = ptl; @@ -4285,6 +4297,6 @@ bool ptlock_alloc(struct page *page) void ptlock_free(struct page *page) { - kfree(page->ptl); + kmem_cache_free(page_ptl_cachep, page->ptl); } #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 489f235..cc2ab37 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -9,7 +9,6 @@ #include <linux/swap.h> #include <linux/interrupt.h> #include <linux/pagemap.h> -#include <linux/bootmem.h> #include <linux/compiler.h> #include <linux/export.h> #include <linux/pagevec.h> @@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, } /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or - * alloc_bootmem_node_nopanic() */ + * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ static int __ref ensure_zone_is_initialized(struct zone *zone, unsigned long start_pfn, unsigned long num_pages) { @@ -1446,6 +1445,7 @@ static int __init cmdline_parse_movable_node(char *p) * the kernel away from hotpluggable memory. */ memblock_set_bottom_up(true); + movable_node_enabled = true; #else pr_warn("movable_node option not supported\n"); #endif diff --git a/mm/migrate.c b/mm/migrate.c index 9194375..a8025be 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -72,28 +72,12 @@ int migrate_prep_local(void) } /* - * Add isolated pages on the list back to the LRU under page lock - * to avoid leaking evictable pages back onto unevictable list. - */ -void putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - - list_for_each_entry_safe(page, page2, l, lru) { - list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); - putback_lru_page(page); - } -} - -/* * Put previously isolated pages back onto the appropriate lists * from where they were once taken off for compaction/migration. * - * This function shall be used instead of putback_lru_pages(), - * whenever the isolated pageset has been built by isolate_migratepages_range() + * This function shall be used whenever the isolated pageset has been + * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() + * and isolate_huge_page(). */ void putback_movable_pages(struct list_head *l) { @@ -199,7 +183,12 @@ out: */ static void remove_migration_ptes(struct page *old, struct page *new) { - rmap_walk(new, remove_migration_pte, old); + struct rmap_walk_control rwc = { + .rmap_one = remove_migration_pte, + .arg = old, + }; + + rmap_walk(new, &rwc); } /* @@ -563,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) * Migration functions ***********************************************************/ -/* Always fail migration. Used for mappings that are not movable */ -int fail_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - /* * Common logic to directly migrate a single page suitable for * pages that do not use PagePrivate/PagePrivate2. @@ -1008,7 +989,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, { int rc = 0; int *result = NULL; - struct page *new_hpage = get_new_page(hpage, private, &result); + struct page *new_hpage; struct anon_vma *anon_vma = NULL; /* @@ -1018,9 +999,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * tables or check whether the hugepage is pmd-based or not before * kicking migration. */ - if (!hugepage_migration_support(page_hstate(hpage))) + if (!hugepage_migration_support(page_hstate(hpage))) { + putback_active_hugepage(hpage); return -ENOSYS; + } + new_hpage = get_new_page(hpage, private, &result); if (!new_hpage) return -ENOMEM; @@ -1120,7 +1104,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, nr_succeeded++; break; default: - /* Permanent failure */ + /* + * Permanent failure (-EBUSY, -ENOSYS, etc.): + * unlike -EAGAIN case, the failed page is + * removed from migration page list and not + * retried in the next outer loop. + */ nr_failed++; break; } @@ -1594,31 +1583,38 @@ bool migrate_ratelimited(int node) } /* Returns true if the node is migrate rate-limited after the update */ -bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) +static bool numamigrate_update_ratelimit(pg_data_t *pgdat, + unsigned long nr_pages) { - bool rate_limited = false; - /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! */ - spin_lock(&pgdat->numabalancing_migrate_lock); if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { + spin_lock(&pgdat->numabalancing_migrate_lock); pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies + msecs_to_jiffies(migrate_interval_millisecs); + spin_unlock(&pgdat->numabalancing_migrate_lock); } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) - rate_limited = true; - else - pgdat->numabalancing_migrate_nr_pages += nr_pages; - spin_unlock(&pgdat->numabalancing_migrate_lock); - - return rate_limited; + if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { + trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, + nr_pages); + return true; + } + + /* + * This is an unlocked non-atomic update so errors are possible. + * The consequences are failing to migrate when we potentiall should + * have which is not severe enough to warrant locking. If it is ever + * a problem, it can be converted to a per-cpu counter. + */ + pgdat->numabalancing_migrate_nr_pages += nr_pages; + return false; } -int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) +static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; @@ -1705,7 +1701,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); if (nr_remaining) { - putback_lru_pages(&migratepages); + if (!list_empty(&migratepages)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } isolated = 0; } else count_vm_numa_event(NUMA_PAGE_MIGRATE); @@ -709,19 +709,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; - locked = len >> PAGE_SHIFT; - locked += current->mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; + locked = len >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + locked += current->mm->locked_vm; /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); + up_write(¤t->mm->mmap_sem); if (!error) error = __mm_populate(start, len, 0); @@ -732,11 +734,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; - down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; + + down_write(¤t->mm->mmap_sem); ret = do_mlock(start, len, 0); up_write(¤t->mm->mmap_sem); + return ret; } @@ -781,12 +785,12 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (flags & MCL_CURRENT) lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; ret = -ENOMEM; + down_write(¤t->mm->mmap_sem); + if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); @@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ @@ -1190,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } +static inline int mlock_future_check(struct mm_struct *mm, + unsigned long flags, + unsigned long len) +{ + unsigned long locked, lock_limit; + + /* mlock MCL_FUTURE? */ + if (flags & VM_LOCKED) { + locked = len >> PAGE_SHIFT; + locked += mm->locked_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return -EAGAIN; + } + return 0; +} + /* * The caller must hold down_write(¤t->mm->mmap_sem). */ @@ -1251,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!can_do_mlock()) return -EPERM; - /* mlock MCL_FUTURE? */ - if (vm_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } + if (mlock_future_check(mm, vm_flags, len)) + return -EAGAIN; if (file) { struct inode *inode = file_inode(file); @@ -2591,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) if (error & ~PAGE_MASK) return error; - /* - * mlock MCL_FUTURE? - */ - if (mm->def_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } + error = mlock_future_check(mm, mm->def_flags, len); + if (error) + return error; /* * mm->mmap_sem is required to protect against another thread diff --git a/mm/mprotect.c b/mm/mprotect.c index bb53a65..7332c17 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,6 +23,7 @@ #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/perf_event.h> +#include <linux/ksm.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptent = *pte; page = vm_normal_page(vma, addr, oldpte); - if (page) { + if (page && !PageKsm(page)) { if (!pte_numa(oldpte)) { ptent = pte_mknuma(ptent); set_pte_at(mm, addr, pte, ptent); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 2c254d3..19121ce 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -41,7 +41,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (limit > memblock.current_limit) limit = memblock.current_limit; - addr = memblock_find_in_range_node(goal, limit, size, align, nid); + addr = memblock_find_in_range_node(size, align, goal, limit, nid); if (!addr) return NULL; @@ -117,7 +117,7 @@ static unsigned long __init free_low_memory_core_early(void) phys_addr_t start, end, size; u64 i; - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); /* free range that is used for reserved array if we allocate it */ @@ -161,7 +161,7 @@ unsigned long __init free_all_bootmem(void) reset_all_zones_managed_pages(); /* - * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 */ @@ -215,7 +215,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, restart: - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); if (ptr) return ptr; @@ -299,7 +299,7 @@ again: if (ptr) return ptr; - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); if (ptr) return ptr; @@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e4a600..054ff47 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock); #ifdef CONFIG_NUMA /** * has_intersects_mems_allowed() - check task eligiblity for kill - * @tsk: task struct of which task to consider + * @start: task struct of which task to consider * @mask: nodemask passed to page allocator for mempolicy ooms * * Task eligibility is determined by whether or not a candidate task, @tsk, * shares the same mempolicy nodes as current if it is bound by such a policy * and whether or not it has the same set of allowed cpuset nodes. */ -static bool has_intersects_mems_allowed(struct task_struct *tsk, +static bool has_intersects_mems_allowed(struct task_struct *start, const nodemask_t *mask) { - struct task_struct *start = tsk; + struct task_struct *tsk; + bool ret = false; - do { + rcu_read_lock(); + for_each_thread(start, tsk) { if (mask) { /* * If this is a mempolicy constrained oom, tsk's @@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, * mempolicy intersects current, otherwise it may be * needlessly killed. */ - if (mempolicy_nodemask_intersects(tsk, mask)) - return true; + ret = mempolicy_nodemask_intersects(tsk, mask); } else { /* * This is not a mempolicy constrained oom, so only * check the mems of tsk's cpuset. */ - if (cpuset_mems_allowed_intersects(current, tsk)) - return true; + ret = cpuset_mems_allowed_intersects(current, tsk); } - } while_each_thread(start, tsk); + if (ret) + break; + } + rcu_read_unlock(); - return false; + return ret; } #else static bool has_intersects_mems_allowed(struct task_struct *tsk, @@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, */ struct task_struct *find_lock_task_mm(struct task_struct *p) { - struct task_struct *t = p; + struct task_struct *t; - do { + rcu_read_lock(); + + for_each_thread(p, t) { task_lock(t); if (likely(t->mm)) - return t; + goto found; task_unlock(t); - } while_each_thread(p, t); + } + t = NULL; +found: + rcu_read_unlock(); - return NULL; + return t; } /* return true if the task is not adequate as candidate victim task. */ @@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, unsigned long chosen_points = 0; rcu_read_lock(); - do_each_thread(g, p) { + for_each_process_thread(g, p) { unsigned int points; switch (oom_scan_process_thread(p, totalpages, nodemask, @@ -323,7 +331,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, chosen = p; chosen_points = points; } - } while_each_thread(g, p); + } if (chosen) get_task_struct(chosen); rcu_read_unlock(); @@ -406,7 +414,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, { struct task_struct *victim = p; struct task_struct *child; - struct task_struct *t = p; + struct task_struct *t; struct mm_struct *mm; unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, @@ -437,7 +445,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * still freeing memory. */ read_lock(&tasklist_lock); - do { + for_each_thread(p, t) { list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; @@ -455,13 +463,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, get_task_struct(victim); } } - } while_each_thread(p, t); + } read_unlock(&tasklist_lock); - rcu_read_lock(); p = find_lock_task_mm(victim); if (!p) { - rcu_read_unlock(); put_task_struct(victim); return; } else if (victim != p) { @@ -487,6 +493,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * That thread will now get access to memory reserves since it has a * pending fatal signal. */ + rcu_read_lock(); for_each_process(p) if (p->mm == mm && !same_thread_group(p, victim) && !(p->flags & PF_KTHREAD)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5248fe0..533e214 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2072,13 +2072,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) return; /* - * Walking all memory to count page types is very expensive and should - * be inhibited in non-blockable contexts. - */ - if (!(gfp_mask & __GFP_WAIT)) - filter |= SHOW_MEM_FILTER_PAGE_COUNT; - - /* * This documents exceptions given to allocations in certain * contexts that are allowed to allocate outside current's set * of allowed nodes. @@ -2242,10 +2235,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (page) { preferred_zone->compact_blockskip_flush = false; - preferred_zone->compact_considered = 0; - preferred_zone->compact_defer_shift = 0; - if (order >= preferred_zone->compact_order_failed) - preferred_zone->compact_order_failed = order + 1; + compaction_defer_reset(preferred_zone, order, true); count_vm_event(COMPACTSUCCESS); return page; } @@ -2535,8 +2525,15 @@ rebalance: } /* Atomic allocations - we can't balance anything */ - if (!wait) + if (!wait) { + /* + * All existing users of the deprecated __GFP_NOFAIL are + * blockable, so warn of any new users that actually allow this + * type of allocation to fail. + */ + WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); goto nopage; + } /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) @@ -3901,6 +3898,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) struct page *page; unsigned long block_migratetype; int reserve; + int old_reserve; /* * Get the start pfn, end pfn and the number of blocks to reserve @@ -3922,6 +3920,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) * future allocation of hugepages at runtime. */ reserve = min(2, reserve); + old_reserve = zone->nr_migrate_reserve_block; + + /* When memory hot-add, we almost always need to do nothing */ + if (reserve == old_reserve) + return; + zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) @@ -3959,6 +3963,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) reserve--; continue; } + } else if (!old_reserve) { + /* + * At boot time we don't need to scan the whole zone + * for turning off MIGRATE_RESERVE. + */ + break; } /* @@ -4209,7 +4219,6 @@ static noinline __init_refok int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; - struct pglist_data *pgdat = zone->zone_pgdat; size_t alloc_size; /* @@ -4225,7 +4234,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) if (!slab_is_available()) { zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node_nopanic(pgdat, alloc_size); + memblock_virt_alloc_node_nopanic( + alloc_size, zone->zone_pgdat->node_id); } else { /* * This case means that a zone whose size was 0 gets new memory @@ -4345,13 +4355,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) #endif /** - * free_bootmem_with_active_regions - Call free_bootmem_node for each active range + * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. - * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node + * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid * * If an architecture guarantees that all ranges registered with * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling free_bootmem() manually. + * this function may be used instead of calling memblock_free_early_nid() + * manually. */ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { @@ -4363,9 +4374,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) end_pfn = min(end_pfn, max_low_pfn); if (start_pfn < end_pfn) - free_bootmem_node(NODE_DATA(this_nid), - PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT); + memblock_free_early_nid(PFN_PHYS(start_pfn), + (end_pfn - start_pfn) << PAGE_SHIFT, + this_nid); } } @@ -4636,8 +4647,9 @@ static void __init setup_usemap(struct pglist_data *pgdat, unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; if (usemapsize) - zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, - usemapsize); + zone->pageblock_flags = + memblock_virt_alloc_node_nopanic(usemapsize, + pgdat->node_id); } #else static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, @@ -4831,7 +4843,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) - map = alloc_bootmem_node_nopanic(pgdat, size); + map = memblock_virt_alloc_node_nopanic(size, + pgdat->node_id); pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -5012,9 +5025,33 @@ static void __init find_zone_movable_pfns_for_nodes(void) nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_MEMORY]); + struct memblock_type *type = &memblock.memory; + + /* Need to find movable_zone earlier when movable_node is specified. */ + find_usable_zone_for_movable(); + + /* + * If movable_node is specified, ignore kernelcore and movablecore + * options. + */ + if (movable_node_is_enabled()) { + for (i = 0; i < type->cnt; i++) { + if (!memblock_is_hotpluggable(&type->regions[i])) + continue; + + nid = type->regions[i].nid; + + usable_startpfn = PFN_DOWN(type->regions[i].base); + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + goto out2; + } /* - * If movablecore was specified, calculate what size of + * If movablecore=nn[KMG] was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore * and movablecore are specified, then the value of kernelcore @@ -5040,7 +5077,6 @@ static void __init find_zone_movable_pfns_for_nodes(void) goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ - find_usable_zone_for_movable(); usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; restart: @@ -5131,6 +5167,7 @@ restart: if (usable_nodes && required_kernelcore > usable_nodes) goto restart; +out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = @@ -5857,7 +5894,7 @@ void *__init alloc_large_system_hash(const char *tablename, do { size = bucketsize << log2qty; if (flags & HASH_EARLY) - table = alloc_bootmem_nopanic(size); + table = memblock_virt_alloc_nopanic(size, 0); else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3bd0b8e..cfd1628 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid) table_size = sizeof(struct page_cgroup) * nr_pages; - base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + base = memblock_virt_alloc_try_nid_nopanic( + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_cgroup = base; diff --git a/mm/percpu.c b/mm/percpu.c index afbf352..036cfe0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); - ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0); if (!ptr) return NULL; ai = ptr; @@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { - free_bootmem(__pa(ai), ai->__ai_size); + memblock_free_early(__pa(ai), ai->__ai_size); } /** @@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); - group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); - unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); - unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); + group_offsets = memblock_virt_alloc(ai->nr_groups * + sizeof(group_offsets[0]), 0); + group_sizes = memblock_virt_alloc(ai->nr_groups * + sizeof(group_sizes[0]), 0); + unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); + unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * empty chunks. */ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); + pcpu_slot = memblock_virt_alloc( + pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); @@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * covers static area + reserved area (mostly used for module * static percpu allocation). */ - schunk = alloc_bootmem(pcpu_chunk_struct_size); + schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); schunk->base_addr = base_addr; schunk->map = smap; @@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* init dynamic chunk if necessary */ if (dyn_size) { - dchunk = alloc_bootmem(pcpu_chunk_struct_size); + dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); dchunk->base_addr = base_addr; dchunk->map = dmap; @@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - areas = alloc_bootmem_nopanic(areas_size); + areas = memblock_virt_alloc_nopanic(areas_size, 0); if (!areas) { rc = -ENOMEM; goto out_free; @@ -1712,7 +1715,7 @@ out_free_areas: out_free: pcpu_free_alloc_info(ai); if (areas) - free_bootmem(__pa(areas), areas_size); + memblock_free_early(__pa(areas), areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ @@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = alloc_bootmem(pages_size); + pages = memblock_virt_alloc(pages_size, 0); /* allocate pages */ j = 0; @@ -1823,7 +1826,7 @@ enomem: free_fn(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: - free_bootmem(__pa(pages), pages_size); + memblock_free_early(__pa(pages), pages_size); pcpu_free_alloc_info(ai); return rc; } @@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); + return memblock_virt_alloc_from_nopanic( + size, align, __pa(MAX_DMA_ADDRESS)); } static void __init pcpu_dfl_fc_free(void *ptr, size_t size) { - free_bootmem(__pa(ptr), size); + memblock_free_early(__pa(ptr), size); } void __init setup_per_cpu_areas(void) @@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void) void *fc; ai = pcpu_alloc_alloc_info(1, 1); - fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + fc = memblock_virt_alloc_from_nopanic(unit_size, + PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ @@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) return 1; } +struct page_referenced_arg { + int mapcount; + int referenced; + unsigned long vm_flags; + struct mem_cgroup *memcg; +}; /* - * Subfunctions of page_referenced: page_referenced_one called - * repeatedly from either page_referenced_anon or page_referenced_file. + * arg: page_referenced_arg will be passed */ int page_referenced_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, unsigned int *mapcount, - unsigned long *vm_flags) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; int referenced = 0; + struct page_referenced_arg *pra = arg; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; @@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); if (!pmd) - goto out; + return SWAP_AGAIN; if (vma->vm_flags & VM_LOCKED) { spin_unlock(ptl); - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ } /* go ahead even if the pmd is pmd_trans_splitting() */ @@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, */ pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) - goto out; + return SWAP_AGAIN; if (vma->vm_flags & VM_LOCKED) { pte_unmap_unlock(pte, ptl); - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ } if (ptep_clear_flush_young_notify(vma, address, pte)) { @@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - (*mapcount)--; - - if (referenced) - *vm_flags |= vma->vm_flags; -out: - return referenced; -} - -static int page_referenced_anon(struct page *page, - struct mem_cgroup *memcg, - unsigned long *vm_flags) -{ - unsigned int mapcount; - struct anon_vma *anon_vma; - pgoff_t pgoff; - struct anon_vma_chain *avc; - int referenced = 0; - - anon_vma = page_lock_anon_vma_read(page); - if (!anon_vma) - return referenced; - - mapcount = page_mapcount(page); - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(page, vma); - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - referenced += page_referenced_one(page, vma, address, - &mapcount, vm_flags); - if (!mapcount) - break; + if (referenced) { + pra->referenced++; + pra->vm_flags |= vma->vm_flags; } - page_unlock_anon_vma_read(anon_vma); - return referenced; + pra->mapcount--; + if (!pra->mapcount) + return SWAP_SUCCESS; /* To break the loop */ + + return SWAP_AGAIN; } -/** - * page_referenced_file - referenced check for object-based rmap - * @page: the page we're checking references on. - * @memcg: target memory control group - * @vm_flags: collect encountered vma->vm_flags who actually referenced the page - * - * For an object-based mapped page, find all the places it is mapped and - * check/clear the referenced flag. This is done by following the page->mapping - * pointer, then walking the chain of vmas it holds. It returns the number - * of references it found. - * - * This function is only called from page_referenced for object-based pages. - */ -static int page_referenced_file(struct page *page, - struct mem_cgroup *memcg, - unsigned long *vm_flags) +static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) { - unsigned int mapcount; - struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma; - int referenced = 0; - - /* - * The caller's checks on page->mapping and !PageAnon have made - * sure that this is a file page: the check for page->mapping - * excludes the case just before it gets set on an anon page. - */ - BUG_ON(PageAnon(page)); - - /* - * The page lock not only makes sure that page->mapping cannot - * suddenly be NULLified by truncation, it makes sure that the - * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_mutex. - */ - BUG_ON(!PageLocked(page)); - - mutex_lock(&mapping->i_mmap_mutex); + struct page_referenced_arg *pra = arg; + struct mem_cgroup *memcg = pra->memcg; - /* - * i_mmap_mutex does not stabilize mapcount at all, but mapcount - * is more likely to be accurate if we note it after spinning. - */ - mapcount = page_mapcount(page); - - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - unsigned long address = vma_address(page, vma); - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - referenced += page_referenced_one(page, vma, address, - &mapcount, vm_flags); - if (!mapcount) - break; - } + if (!mm_match_cgroup(vma->vm_mm, memcg)) + return true; - mutex_unlock(&mapping->i_mmap_mutex); - return referenced; + return false; } /** @@ -851,41 +768,57 @@ int page_referenced(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int referenced = 0; + int ret; int we_locked = 0; + struct page_referenced_arg pra = { + .mapcount = page_mapcount(page), + .memcg = memcg, + }; + struct rmap_walk_control rwc = { + .rmap_one = page_referenced_one, + .arg = (void *)&pra, + .anon_lock = page_lock_anon_vma_read, + }; *vm_flags = 0; - if (page_mapped(page) && page_rmapping(page)) { - if (!is_locked && (!PageAnon(page) || PageKsm(page))) { - we_locked = trylock_page(page); - if (!we_locked) { - referenced++; - goto out; - } - } - if (unlikely(PageKsm(page))) - referenced += page_referenced_ksm(page, memcg, - vm_flags); - else if (PageAnon(page)) - referenced += page_referenced_anon(page, memcg, - vm_flags); - else if (page->mapping) - referenced += page_referenced_file(page, memcg, - vm_flags); - if (we_locked) - unlock_page(page); + if (!page_mapped(page)) + return 0; + + if (!page_rmapping(page)) + return 0; + + if (!is_locked && (!PageAnon(page) || PageKsm(page))) { + we_locked = trylock_page(page); + if (!we_locked) + return 1; } -out: - return referenced; + + /* + * If we are reclaiming on behalf of a cgroup, skip + * counting on behalf of references from different + * cgroups + */ + if (memcg) { + rwc.invalid_vma = invalid_page_referenced_vma; + } + + ret = rmap_walk(page, &rwc); + *vm_flags = pra.vm_flags; + + if (we_locked) + unlock_page(page); + + return pra.referenced; } static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, - unsigned long address) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; pte_t *pte; spinlock_t *ptl; int ret = 0; + int *cleaned = arg; pte = page_check_address(page, mm, address, &ptl, 1); if (!pte) @@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); - if (ret) + if (ret) { mmu_notifier_invalidate_page(mm, address); + (*cleaned)++; + } out: - return ret; + return SWAP_AGAIN; } -static int page_mkclean_file(struct address_space *mapping, struct page *page) +static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma; - int ret = 0; - - BUG_ON(PageAnon(page)); + if (vma->vm_flags & VM_SHARED) + return 0; - mutex_lock(&mapping->i_mmap_mutex); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - if (vma->vm_flags & VM_SHARED) { - unsigned long address = vma_address(page, vma); - ret += page_mkclean_one(page, vma, address); - } - } - mutex_unlock(&mapping->i_mmap_mutex); - return ret; + return 1; } int page_mkclean(struct page *page) { - int ret = 0; + int cleaned = 0; + struct address_space *mapping; + struct rmap_walk_control rwc = { + .arg = (void *)&cleaned, + .rmap_one = page_mkclean_one, + .invalid_vma = invalid_mkclean_vma, + }; BUG_ON(!PageLocked(page)); - if (page_mapped(page)) { - struct address_space *mapping = page_mapping(page); - if (mapping) - ret = page_mkclean_file(mapping, page); - } + if (!page_mapped(page)) + return 0; - return ret; + mapping = page_mapping(page); + if (!mapping) + return 0; + + rmap_walk(page, &rwc); + + return cleaned; } EXPORT_SYMBOL_GPL(page_mkclean); @@ -1177,17 +1110,17 @@ out: } /* - * Subfunctions of try_to_unmap: try_to_unmap_one called - * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. + * @arg: enum ttu_flags will be passed to this argument */ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, enum ttu_flags flags) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; pte_t *pte; pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; + enum ttu_flags flags = (enum ttu_flags)arg; pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) @@ -1426,124 +1359,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -bool is_vma_temporary_stack(struct vm_area_struct *vma) -{ - int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); - - if (!maybe_stack) - return false; - - if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == - VM_STACK_INCOMPLETE_SETUP) - return true; - - return false; -} - -/** - * try_to_unmap_anon - unmap or unlock anonymous page using the object-based - * rmap method - * @page: the page to unmap/unlock - * @flags: action and flags - * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the anon_vma struct it points to. - * - * This function is only called from try_to_unmap/try_to_munlock for - * anonymous pages. - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * 'LOCKED. - */ -static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) -{ - struct anon_vma *anon_vma; - pgoff_t pgoff; - struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; - - anon_vma = page_lock_anon_vma_read(page); - if (!anon_vma) - return ret; - - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long address; - - /* - * During exec, a temporary VMA is setup and later moved. - * The VMA is moved under the anon_vma lock but not the - * page tables leading to a race where migration cannot - * find the migration ptes. Rather than increasing the - * locking requirements of exec(), migration skips - * temporary VMAs until after exec() completes. - */ - if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && - is_vma_temporary_stack(vma)) - continue; - - address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) - break; - } - - page_unlock_anon_vma_read(anon_vma); - return ret; -} - -/** - * try_to_unmap_file - unmap/unlock file page using the object-based rmap method - * @page: the page to unmap/unlock - * @flags: action and flags - * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the address_space struct it points to. - * - * This function is only called from try_to_unmap/try_to_munlock for - * object-based pages. - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * 'LOCKED. - */ -static int try_to_unmap_file(struct page *page, enum ttu_flags flags) +static int try_to_unmap_nonlinear(struct page *page, + struct address_space *mapping, struct vm_area_struct *vma) { - struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; unsigned long max_nl_size = 0; unsigned int mapcount; - if (PageHuge(page)) - pgoff = page->index << compound_order(page); + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { - mutex_lock(&mapping->i_mmap_mutex); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - unsigned long address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) - goto out; - } - - if (list_empty(&mapping->i_mmap_nonlinear)) - goto out; - - /* - * We don't bother to try to find the munlocked page in nonlinears. - * It's costly. Instead, later, page reclaim logic may call - * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. - */ - if (TTU_ACTION(flags) == TTU_MUNLOCK) - goto out; - - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1553,8 +1380,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) } if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ - ret = SWAP_FAIL; - goto out; + return SWAP_FAIL; } /* @@ -1566,7 +1392,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) */ mapcount = page_mapcount(page); if (!mapcount) - goto out; + return ret; + cond_resched(); max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; @@ -1574,10 +1401,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) max_nl_cursor = CLUSTER_SIZE; do { - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.nonlinear) { + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + cursor = (unsigned long) vma->vm_private_data; - while ( cursor < max_nl_cursor && + while (cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { if (try_to_unmap_cluster(cursor, &mapcount, vma, page) == SWAP_MLOCK) @@ -1585,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) cursor += CLUSTER_SIZE; vma->vm_private_data = (void *) cursor; if ((int)mapcount <= 0) - goto out; + return ret; } vma->vm_private_data = (void *) max_nl_cursor; } @@ -1600,11 +1428,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) */ list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) vma->vm_private_data = NULL; -out: - mutex_unlock(&mapping->i_mmap_mutex); + return ret; } +bool is_vma_temporary_stack(struct vm_area_struct *vma) +{ + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); + + if (!maybe_stack) + return false; + + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == + VM_STACK_INCOMPLETE_SETUP) + return true; + + return false; +} + +static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) +{ + return is_vma_temporary_stack(vma); +} + +static int page_not_mapped(struct page *page) +{ + return !page_mapped(page); +}; + /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped @@ -1622,16 +1473,29 @@ out: int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)flags, + .done = page_not_mapped, + .file_nonlinear = try_to_unmap_nonlinear, + .anon_lock = page_lock_anon_vma_read, + }; - BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); - if (unlikely(PageKsm(page))) - ret = try_to_unmap_ksm(page, flags); - else if (PageAnon(page)) - ret = try_to_unmap_anon(page, flags); - else - ret = try_to_unmap_file(page, flags); + /* + * During exec, a temporary VMA is setup and later moved. + * The VMA is moved under the anon_vma lock but not the + * page tables leading to a race where migration cannot + * find the migration ptes. Rather than increasing the + * locking requirements of exec(), migration skips + * temporary VMAs until after exec() completes. + */ + if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) + rwc.invalid_vma = invalid_migration_vma; + + ret = rmap_walk(page, &rwc); + if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; @@ -1654,14 +1518,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) */ int try_to_munlock(struct page *page) { + int ret; + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)TTU_MUNLOCK, + .done = page_not_mapped, + /* + * We don't bother to try to find the munlocked page in + * nonlinears. It's costly. Instead, later, page reclaim logic + * may call try_to_unmap() and recover PG_mlocked lazily. + */ + .file_nonlinear = NULL, + .anon_lock = page_lock_anon_vma_read, + + }; + VM_BUG_ON(!PageLocked(page) || PageLRU(page)); - if (unlikely(PageKsm(page))) - return try_to_unmap_ksm(page, TTU_MUNLOCK); - else if (PageAnon(page)) - return try_to_unmap_anon(page, TTU_MUNLOCK); - else - return try_to_unmap_file(page, TTU_MUNLOCK); + ret = rmap_walk(page, &rwc); + return ret; } void __put_anon_vma(struct anon_vma *anon_vma) @@ -1674,18 +1549,13 @@ void __put_anon_vma(struct anon_vma *anon_vma) anon_vma_free(anon_vma); } -#ifdef CONFIG_MIGRATION -/* - * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): - * Called by migrate.c to remove migration ptes, but might be used more later. - */ -static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +static struct anon_vma *rmap_walk_anon_lock(struct page *page, + struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; + + if (rwc->anon_lock) + return rwc->anon_lock(page); /* * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() @@ -1695,58 +1565,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, */ anon_vma = page_anon_vma(page); if (!anon_vma) - return ret; + return NULL; + anon_vma_lock_read(anon_vma); + return anon_vma; +} + +/* + * rmap_walk_anon - do something to anonymous page using the object-based + * rmap method + * @page: the page to be handled + * @rwc: control variable according to each walk type + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the anon_vma struct it points to. + * + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * LOCKED. + */ +static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) +{ + struct anon_vma *anon_vma; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct anon_vma_chain *avc; + int ret = SWAP_AGAIN; + + anon_vma = rmap_walk_anon_lock(page, rwc); + if (!anon_vma) + return ret; + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - ret = rmap_one(page, vma, address, arg); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; + if (rwc->done && rwc->done(page)) + break; } anon_vma_unlock_read(anon_vma); return ret; } -static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +/* + * rmap_walk_file - do something to file page using the object-based rmap method + * @page: the page to be handled + * @rwc: control variable according to each walk type + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * LOCKED. + */ +static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page->index << compound_order(page); struct vm_area_struct *vma; int ret = SWAP_AGAIN; + /* + * The page lock not only makes sure that page->mapping cannot + * suddenly be NULLified by truncation, it makes sure that the + * structure at mapping cannot be freed and reused yet, + * so we can safely take mapping->i_mmap_mutex. + */ + VM_BUG_ON(!PageLocked(page)); + if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - ret = rmap_one(page, vma, address, arg); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) - break; + goto done; + if (rwc->done && rwc->done(page)) + goto done; } - /* - * No nonlinear handling: being always shared, nonlinear vmas - * never contain migration ptes. Decide what to do about this - * limitation to linear when we need rmap_walk() on nonlinear. - */ + + if (!rwc->file_nonlinear) + goto done; + + if (list_empty(&mapping->i_mmap_nonlinear)) + goto done; + + ret = rwc->file_nonlinear(page, mapping, vma); + +done: mutex_unlock(&mapping->i_mmap_mutex); return ret; } -int rmap_walk(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +int rmap_walk(struct page *page, struct rmap_walk_control *rwc) { - VM_BUG_ON(!PageLocked(page)); - if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rmap_one, arg); + return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rmap_one, arg); + return rmap_walk_anon(page, rwc); else - return rmap_walk_file(page, rmap_one, arg); + return rmap_walk_file(page, rwc); } -#endif /* CONFIG_MIGRATION */ #ifdef CONFIG_HUGETLB_PAGE /* diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 27eeab3..4cba9c2 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); + return memblock_virt_alloc_try_nid(size, align, goal, + BOOTMEM_ALLOC_ACCESSIBLE, node); } static void *vmemmap_buf; @@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (vmemmap_buf_start) { /* need to free left buf */ - free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); + memblock_free_early(__pa(vmemmap_buf), + vmemmap_buf_end - vmemmap_buf); vmemmap_buf = NULL; vmemmap_buf_end = NULL; } diff --git a/mm/sparse.c b/mm/sparse.c index 8cc7be0..63c3ea5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) else section = kzalloc(array_size, GFP_KERNEL); } else { - section = alloc_bootmem_node(NODE_DATA(nid), array_size); + section = memblock_virt_alloc_node(array_size, nid); } return section; @@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: - p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, - SMP_CACHE_BYTES, goal, limit); + p = memblock_virt_alloc_try_nid_nopanic(size, + SMP_CACHE_BYTES, goal, limit, + nid); if (!p && limit) { limit = 0; goto again; @@ -331,7 +332,7 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - return alloc_bootmem_node_nopanic(pgdat, size); + return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) return map; size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); - map = __alloc_bootmem_node_high(NODE_DATA(nid), size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + map = memblock_virt_alloc_try_nid(size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); return map; } void __init sparse_mem_maps_populate_node(struct page **map_map, @@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } size = PAGE_ALIGN(size); - map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + map = memblock_virt_alloc_try_nid(size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nodeid); if (map) { for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) @@ -545,7 +548,7 @@ void __init sparse_init(void) * sparse_early_mem_map_alloc, so allocate usemap_map at first. */ size = sizeof(unsigned long *) * NR_MEM_SECTIONS; - usemap_map = alloc_bootmem(size); + usemap_map = memblock_virt_alloc(size, 0); if (!usemap_map) panic("can not allocate usemap_map\n"); alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, @@ -553,7 +556,7 @@ void __init sparse_init(void) #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER size2 = sizeof(struct page *) * NR_MEM_SECTIONS; - map_map = alloc_bootmem(size2); + map_map = memblock_virt_alloc(size2, 0); if (!map_map) panic("can not allocate map_map\n"); alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, @@ -583,9 +586,9 @@ void __init sparse_init(void) vmemmap_populate_print_last(); #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - free_bootmem(__pa(map_map), size2); + memblock_free_early(__pa(map_map), size2); #endif - free_bootmem(__pa(usemap_map), size); + memblock_free_early(__pa(usemap_map), size); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -31,7 +31,6 @@ #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> -#include <linux/hugetlb.h> #include "internal.h" @@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page) static void put_compound_page(struct page *page) { - if (unlikely(PageTail(page))) { - /* __split_huge_page_refcount can run under us */ - struct page *page_head = compound_trans_head(page); - - if (likely(page != page_head && - get_page_unless_zero(page_head))) { - unsigned long flags; + struct page *page_head; + if (likely(!PageTail(page))) { + if (put_page_testzero(page)) { /* - * THP can not break up slab pages so avoid taking - * compound_lock(). Slab performs non-atomic bit ops - * on page->flags for better performance. In particular - * slab_unlock() in slub used to be a hot path. It is - * still hot on arches that do not support - * this_cpu_cmpxchg_double(). + * By the time all refcounts have been released + * split_huge_page cannot run anymore from under us. */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON(!PageHead(page_head)); - atomic_dec(&page->_mapcount); - if (put_page_testzero(page_head)) - VM_BUG_ON(1); - if (put_page_testzero(page_head)) - __put_compound_page(page_head); - return; - } else - /* - * __split_huge_page_refcount - * run before us, "page" was a - * THP tail. The split - * page_head has been freed - * and reallocated as slab or - * hugetlbfs page of smaller - * order (only possible if - * reallocated as slab on - * x86). - */ - goto skip_lock; - } + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); + } + return; + } + + /* __split_huge_page_refcount can run under us */ + page_head = compound_trans_head(page); + + /* + * THP can not break up slab pages so avoid taking + * compound_lock() and skip the tail page refcounting (in + * _mapcount) too. Slab performs non-atomic bit ops on + * page->flags for better performance. In particular + * slab_unlock() in slub used to be a hot path. It is still + * hot on arches that do not support + * this_cpu_cmpxchg_double(). + * + * If "page" is part of a slab or hugetlbfs page it cannot be + * splitted and the head page cannot change from under us. And + * if "page" is part of a THP page under splitting, if the + * head page pointed by the THP tail isn't a THP head anymore, + * we'll find PageTail clear after smp_rmb() and we'll treat + * it as a single page. + */ + if (!__compound_tail_refcounted(page_head)) { + /* + * If "page" is a THP tail, we must read the tail page + * flags after the head page flags. The + * split_huge_page side enforces write memory barriers + * between clearing PageTail and before the head page + * can be freed and reallocated. + */ + smp_rmb(); + if (likely(PageTail(page))) { /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. + * __split_huge_page_refcount cannot race + * here. */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); -skip_lock: - if (put_page_testzero(page_head)) { - /* - * The head page may have been - * freed and reallocated as a - * compound page of smaller - * order and then freed again. - * All we know is that it - * cannot have become: a THP - * page, a compound page of - * higher order, a tail page. - * That is because we still - * hold the refcount of the - * split THP tail and - * page_head was the THP head - * before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } -out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; + VM_BUG_ON(!PageHead(page_head)); + VM_BUG_ON(page_mapcount(page) != 0); + if (put_page_testzero(page_head)) { + /* + * If this is the tail of a slab + * compound page, the tail pin must + * not be the last reference held on + * the page, because the PG_slab + * cannot be cleared before all tail + * pins (which skips the _mapcount + * tail refcounting) have been + * released. For hugetlbfs the tail + * pin may be the last reference on + * the page instead, because + * PageHeadHuge will not go away until + * the compound page enters the buddy + * allocator. + */ + VM_BUG_ON(PageSlab(page_head)); + __put_compound_page(page_head); } - VM_BUG_ON(page_head != page->first_page); + return; + } else /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on - * the compound_lock. + * __split_huge_page_refcount run before us, + * "page" was a THP tail. The split page_head + * has been freed and reallocated as slab or + * hugetlbfs page of smaller order (only + * possible if reallocated as slab on x86). */ - if (put_page_testzero(page_head)) - VM_BUG_ON(1); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON(page_mapcount(page) <= 0); - atomic_dec(&page->_mapcount); - VM_BUG_ON(atomic_read(&page_head->_count) <= 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - compound_unlock_irqrestore(page_head, flags); + goto out_put_single; + } + if (likely(page != page_head && get_page_unless_zero(page_head))) { + unsigned long flags; + + /* + * page_head wasn't a dangling pointer but it may not + * be a head page anymore by the time we obtain the + * lock. That is ok as long as it can't be freed from + * under us. + */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { + /* + * The head page may have been freed + * and reallocated as a compound page + * of smaller order and then freed + * again. All we know is that it + * cannot have become: a THP page, a + * compound page of higher order, a + * tail page. That is because we + * still hold the refcount of the + * split THP tail and page_head was + * the THP head before the split. + */ if (PageHead(page_head)) __put_compound_page(page_head); else __put_single_page(page_head); } - } else { - /* page_head is a dangling pointer */ - VM_BUG_ON(PageTail(page)); - goto out_put_single; +out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; } - } else if (put_page_testzero(page)) { - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on the + * compound_lock. + */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON(page_mapcount(page) <= 0); + atomic_dec(&page->_mapcount); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + compound_unlock_irqrestore(page_head, flags); + + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON(PageTail(page)); + goto out_put_single; } } @@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page) * split_huge_page(). */ unsigned long flags; - bool got = false; + bool got; struct page *page_head = compound_trans_head(page); - if (likely(page != page_head && get_page_unless_zero(page_head))) { - /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { - if (likely(PageTail(page))) { - /* - * This is a hugetlbfs page or a slab - * page. __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON(!PageHead(page_head)); - __get_page_tail_foll(page, false); - return true; - } else { - /* - * __split_huge_page_refcount run - * before us, "page" was a THP - * tail. The split page_head has been - * freed and reallocated as slab or - * hugetlbfs page of smaller order - * (only possible if reallocated as - * slab on x86). - */ - put_page(page_head); - return false; - } + /* Ref to put_compound_page() comment. */ + if (!__compound_tail_refcounted(page_head)) { + smp_rmb(); + if (likely(PageTail(page))) { + /* + * This is a hugetlbfs page or a slab + * page. __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON(!PageHead(page_head)); + __get_page_tail_foll(page, true); + return true; + } else { + /* + * __split_huge_page_refcount run + * before us, "page" was a THP + * tail. The split page_head has been + * freed and reallocated as slab or + * hugetlbfs page of smaller order + * (only possible if reallocated as + * slab on x86). + */ + return false; } + } + got = false; + if (likely(page != page_head && get_page_unless_zero(page_head))) { /* * page_head wasn't a dangling pointer but it * may not be a head page anymore by the time @@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page) return mapping; } +int overcommit_ratio_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_kbytes = 0; + return ret; +} + +int overcommit_kbytes_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_ratio = 0; + return ret; +} + /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ unsigned long vm_commit_limit(void) { - return ((totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100) + total_swap_pages; + unsigned long allowed; + + if (sysctl_overcommit_kbytes) + allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); + else + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100); + allowed += total_swap_pages; + + return allowed; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf968..e4f0db2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x) } /* - * Walk a vmap address to the struct page it maps. + * Walk a vmap address to the physical pfn it maps to. */ -struct page *vmalloc_to_page(const void *vmalloc_addr) +unsigned long vmalloc_to_pfn(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; - struct page *page = NULL; + unsigned long pfn = 0; pgd_t *pgd = pgd_offset_k(addr); /* @@ -244,23 +244,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) ptep = pte_offset_map(pmd, addr); pte = *ptep; if (pte_present(pte)) - page = pte_page(pte); + pfn = pte_pfn(pte); pte_unmap(ptep); } } } - return page; + return pfn; } -EXPORT_SYMBOL(vmalloc_to_page); +EXPORT_SYMBOL(vmalloc_to_pfn); /* - * Map a vmalloc()-space virtual address to the physical page frame number. + * Map a vmalloc()-space virtual address to the struct page. */ -unsigned long vmalloc_to_pfn(const void *vmalloc_addr) +struct page *vmalloc_to_page(const void *vmalloc_addr) { - return page_to_pfn(vmalloc_to_page(vmalloc_addr)); + return pfn_to_page(vmalloc_to_pfn(vmalloc_addr)); } -EXPORT_SYMBOL(vmalloc_to_pfn); +EXPORT_SYMBOL(vmalloc_to_page); /*** Global kva allocator ***/ |