diff options
156 files changed, 3930 insertions, 2919 deletions
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 49b8551..e48c57f 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -818,7 +818,7 @@ RCU pointer/list update: list_add_tail_rcu list_del_rcu list_replace_rcu - hlist_add_after_rcu + hlist_add_behind_rcu hlist_add_before_rcu hlist_add_head_rcu hlist_del_rcu diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 883901b..9344d83 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1716,8 +1716,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. 7 (KERN_DEBUG) debug-level messages log_buf_len=n[KMG] Sets the size of the printk ring buffer, - in bytes. n must be a power of two. The default - size is set in the kernel config file. + in bytes. n must be a power of two and greater + than the minimal size. The minimal size is defined + by LOG_BUF_SHIFT kernel config parameter. There is + also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter + that allows to increase the default size depending on + the number of CPUs. See init/Kconfig for more details. logo.nologo [FB] Disables display of the built-in Linux logo. This may be used to provide more screen space for diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 78c9a7b..8f961ef 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -47,6 +47,10 @@ use constant HIGH_KSWAPD_REWAKEUP => 21; use constant HIGH_NR_SCANNED => 22; use constant HIGH_NR_TAKEN => 23; use constant HIGH_NR_RECLAIMED => 24; +use constant HIGH_NR_FILE_SCANNED => 25; +use constant HIGH_NR_ANON_SCANNED => 26; +use constant HIGH_NR_FILE_RECLAIMED => 27; +use constant HIGH_NR_ANON_RECLAIMED => 28; my %perprocesspid; my %perprocess; @@ -56,14 +60,18 @@ my $opt_read_procstat; my $total_wakeup_kswapd; my ($total_direct_reclaim, $total_direct_nr_scanned); +my ($total_direct_nr_file_scanned, $total_direct_nr_anon_scanned); my ($total_direct_latency, $total_kswapd_latency); my ($total_direct_nr_reclaimed); +my ($total_direct_nr_file_reclaimed, $total_direct_nr_anon_reclaimed); my ($total_direct_writepage_file_sync, $total_direct_writepage_file_async); my ($total_direct_writepage_anon_sync, $total_direct_writepage_anon_async); my ($total_kswapd_nr_scanned, $total_kswapd_wake); +my ($total_kswapd_nr_file_scanned, $total_kswapd_nr_anon_scanned); my ($total_kswapd_writepage_file_sync, $total_kswapd_writepage_file_async); my ($total_kswapd_writepage_anon_sync, $total_kswapd_writepage_anon_async); my ($total_kswapd_nr_reclaimed); +my ($total_kswapd_nr_file_reclaimed, $total_kswapd_nr_anon_reclaimed); # Catch sigint and exit on request my $sigint_report = 0; @@ -374,6 +382,7 @@ EVENT_PROCESS: } my $isolate_mode = $1; my $nr_scanned = $4; + my $file = $6; # To closer match vmstat scanning statistics, only count isolate_both # and isolate_inactive as scanning. isolate_active is rotation @@ -382,6 +391,11 @@ EVENT_PROCESS: # isolate_both == 3 if ($isolate_mode != 2) { $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; + if ($file == 1) { + $perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED} += $nr_scanned; + } else { + $perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED} += $nr_scanned; + } } } elsif ($tracepoint eq "mm_vmscan_lru_shrink_inactive") { $details = $6; @@ -391,8 +405,19 @@ EVENT_PROCESS: print " $regex_lru_shrink_inactive/o\n"; next; } + my $nr_reclaimed = $4; + my $flags = $6; + my $file = 0; + if ($flags =~ /RECLAIM_WB_FILE/) { + $file = 1; + } $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED} += $nr_reclaimed; + if ($file) { + $perprocesspid{$process_pid}->{HIGH_NR_FILE_RECLAIMED} += $nr_reclaimed; + } else { + $perprocesspid{$process_pid}->{HIGH_NR_ANON_RECLAIMED} += $nr_reclaimed; + } } elsif ($tracepoint eq "mm_vmscan_writepage") { $details = $6; if ($details !~ /$regex_writepage/o) { @@ -493,7 +518,11 @@ sub dump_stats { $total_direct_reclaim += $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}; $total_wakeup_kswapd += $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $total_direct_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_direct_nr_file_scanned += $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}; + $total_direct_nr_anon_scanned += $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}; $total_direct_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; + $total_direct_nr_file_reclaimed += $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}; + $total_direct_nr_anon_reclaimed += $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}; $total_direct_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_direct_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_direct_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; @@ -513,7 +542,11 @@ sub dump_stats { $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}, $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}, + $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}, $stats{$process_pid}->{HIGH_NR_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}, $this_reclaim_delay / 1000); @@ -552,7 +585,11 @@ sub dump_stats { $total_kswapd_wake += $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}; $total_kswapd_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_kswapd_nr_file_scanned += $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}; + $total_kswapd_nr_anon_scanned += $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}; $total_kswapd_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; + $total_kswapd_nr_file_reclaimed += $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}; + $total_kswapd_nr_anon_reclaimed += $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}; $total_kswapd_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_kswapd_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_kswapd_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; @@ -563,7 +600,11 @@ sub dump_stats { $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}, $stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}, + $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}, $stats{$process_pid}->{HIGH_NR_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}); @@ -594,7 +635,11 @@ sub dump_stats { print "\nSummary\n"; print "Direct reclaims: $total_direct_reclaim\n"; print "Direct reclaim pages scanned: $total_direct_nr_scanned\n"; + print "Direct reclaim file pages scanned: $total_direct_nr_file_scanned\n"; + print "Direct reclaim anon pages scanned: $total_direct_nr_anon_scanned\n"; print "Direct reclaim pages reclaimed: $total_direct_nr_reclaimed\n"; + print "Direct reclaim file pages reclaimed: $total_direct_nr_file_reclaimed\n"; + print "Direct reclaim anon pages reclaimed: $total_direct_nr_anon_reclaimed\n"; print "Direct reclaim write file sync I/O: $total_direct_writepage_file_sync\n"; print "Direct reclaim write anon sync I/O: $total_direct_writepage_anon_sync\n"; print "Direct reclaim write file async I/O: $total_direct_writepage_file_async\n"; @@ -604,7 +649,11 @@ sub dump_stats { print "\n"; print "Kswapd wakeups: $total_kswapd_wake\n"; print "Kswapd pages scanned: $total_kswapd_nr_scanned\n"; + print "Kswapd file pages scanned: $total_kswapd_nr_file_scanned\n"; + print "Kswapd anon pages scanned: $total_kswapd_nr_anon_scanned\n"; print "Kswapd pages reclaimed: $total_kswapd_nr_reclaimed\n"; + print "Kswapd file pages reclaimed: $total_kswapd_nr_file_reclaimed\n"; + print "Kswapd anon pages reclaimed: $total_kswapd_nr_anon_reclaimed\n"; print "Kswapd reclaim write file sync I/O: $total_kswapd_writepage_file_sync\n"; print "Kswapd reclaim write anon sync I/O: $total_kswapd_writepage_anon_sync\n"; print "Kswapd reclaim write file async I/O: $total_kswapd_writepage_file_async\n"; @@ -629,7 +678,11 @@ sub aggregate_perprocesspid() { $perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD} += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $perprocess{$process}->{HIGH_KSWAPD_REWAKEUP} += $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP}; $perprocess{$process}->{HIGH_NR_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_SCANNED}; + $perprocess{$process}->{HIGH_NR_FILE_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED}; + $perprocess{$process}->{HIGH_NR_ANON_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED}; $perprocess{$process}->{HIGH_NR_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED}; + $perprocess{$process}->{HIGH_NR_FILE_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_FILE_RECLAIMED}; + $perprocess{$process}->{HIGH_NR_ANON_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_ANON_RECLAIMED}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; @@ -621,6 +621,9 @@ else KBUILD_CFLAGS += -O2 endif +# Tell gcc to never replace conditional load with a non-conditional one +KBUILD_CFLAGS += $(call cc-option,--param=allow-store-data-races=0) + ifdef CONFIG_READABLE_ASM # Disable optimizations that make assembler listings hard to read. # reorder blocks reorders the control in the function @@ -636,6 +639,22 @@ KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN}) endif # Handle stack protector mode. +# +# Since kbuild can potentially perform two passes (first with the old +# .config values and then with updated .config values), we cannot error out +# if a desired compiler option is unsupported. If we were to error, kbuild +# could never get to the second pass and actually notice that we changed +# the option to something that was supported. +# +# Additionally, we don't want to fallback and/or silently change which compiler +# flags will be used, since that leads to producing kernels with different +# security feature characteristics depending on the compiler used. ("But I +# selected CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!") +# +# The middle ground is to warn here so that the failed option is obvious, but +# to let the build fail with bad compiler flags so that we can't produce a +# kernel when there is a CONFIG and compiler mismatch. +# ifdef CONFIG_CC_STACKPROTECTOR_REGULAR stackp-flag := -fstack-protector ifeq ($(call cc-option, $(stackp-flag)),) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 1f88db0..7a996aa 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -26,6 +26,7 @@ #include <linux/io.h> #include <linux/vmalloc.h> #include <linux/sizes.h> +#include <linux/cma.h> #include <asm/memory.h> #include <asm/highmem.h> diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 25c3502..892d43e 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -631,7 +631,8 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdat = NODE_DATA(nid); - zone = pgdat->node_zones + ZONE_NORMAL; + zone = pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL); ret = __add_pages(nid, zone, start_pfn, nr_pages); if (ret) diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index ce569b6..72905c3 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -90,7 +90,6 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_hv_rm_mmu.o \ book3s_hv_ras.o \ book3s_hv_builtin.o \ - book3s_hv_cma.o \ $(kvm-book3s_64-builtin-xics-objs-y) endif diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 68468d6..a01744f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -37,8 +37,6 @@ #include <asm/ppc-opcode.h> #include <asm/cputable.h> -#include "book3s_hv_cma.h" - /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63 @@ -64,10 +62,10 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) } kvm->arch.hpt_cma_alloc = 0; - VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER); page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + memset((void *)hpt, 0, (1 << order)); kvm->arch.hpt_cma_alloc = 1; } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 7cde8a6..6cf498a 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -16,12 +16,14 @@ #include <linux/init.h> #include <linux/memblock.h> #include <linux/sizes.h> +#include <linux/cma.h> #include <asm/cputable.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> -#include "book3s_hv_cma.h" +#define KVM_CMA_CHUNK_ORDER 18 + /* * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) * should be power of 2. @@ -43,6 +45,8 @@ static unsigned long kvm_cma_resv_ratio = 5; unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */ EXPORT_SYMBOL_GPL(kvm_rma_pages); +static struct cma *kvm_cma; + /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ static inline int lpcr_rmls(unsigned long rma_size) @@ -97,7 +101,7 @@ struct kvm_rma_info *kvm_alloc_rma() ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL); if (!ri) return NULL; - page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages); + page = cma_alloc(kvm_cma, kvm_rma_pages, get_order(kvm_rma_pages)); if (!page) goto err_out; atomic_set(&ri->use_count, 1); @@ -112,7 +116,7 @@ EXPORT_SYMBOL_GPL(kvm_alloc_rma); void kvm_release_rma(struct kvm_rma_info *ri) { if (atomic_dec_and_test(&ri->use_count)) { - kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages); + cma_release(kvm_cma, pfn_to_page(ri->base_pfn), kvm_rma_pages); kfree(ri); } } @@ -131,16 +135,18 @@ struct page *kvm_alloc_hpt(unsigned long nr_pages) { unsigned long align_pages = HPT_ALIGN_PAGES; + VM_BUG_ON(get_order(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + /* Old CPUs require HPT aligned on a multiple of its size */ if (!cpu_has_feature(CPU_FTR_ARCH_206)) align_pages = nr_pages; - return kvm_alloc_cma(nr_pages, align_pages); + return cma_alloc(kvm_cma, nr_pages, get_order(align_pages)); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt); void kvm_release_hpt(struct page *page, unsigned long nr_pages) { - kvm_release_cma(page, nr_pages); + cma_release(kvm_cma, page, nr_pages); } EXPORT_SYMBOL_GPL(kvm_release_hpt); @@ -179,7 +185,8 @@ void __init kvm_cma_reserve(void) align_size = HPT_ALIGN_PAGES << PAGE_SHIFT; align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); - kvm_cma_declare_contiguous(selected_size, align_size); + cma_declare_contiguous(0, selected_size, 0, align_size, + KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma); } } diff --git a/arch/powerpc/kvm/book3s_hv_cma.c b/arch/powerpc/kvm/book3s_hv_cma.c deleted file mode 100644 index d9d3d85..0000000 --- a/arch/powerpc/kvm/book3s_hv_cma.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA - * for DMA mapping framework - * - * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License or (at your optional) any later version of the license. - * - */ -#define pr_fmt(fmt) "kvm_cma: " fmt - -#ifdef CONFIG_CMA_DEBUG -#ifndef DEBUG -# define DEBUG -#endif -#endif - -#include <linux/memblock.h> -#include <linux/mutex.h> -#include <linux/sizes.h> -#include <linux/slab.h> - -#include "book3s_hv_cma.h" - -struct kvm_cma { - unsigned long base_pfn; - unsigned long count; - unsigned long *bitmap; -}; - -static DEFINE_MUTEX(kvm_cma_mutex); -static struct kvm_cma kvm_cma_area; - -/** - * kvm_cma_declare_contiguous() - reserve area for contiguous memory handling - * for kvm hash pagetable - * @size: Size of the reserved memory. - * @alignment: Alignment for the contiguous memory area - * - * This function reserves memory for kvm cma area. It should be - * called by arch code when early allocator (memblock or bootmem) - * is still activate. - */ -long __init kvm_cma_declare_contiguous(phys_addr_t size, phys_addr_t alignment) -{ - long base_pfn; - phys_addr_t addr; - struct kvm_cma *cma = &kvm_cma_area; - - pr_debug("%s(size %lx)\n", __func__, (unsigned long)size); - - if (!size) - return -EINVAL; - /* - * Sanitise input arguments. - * We should be pageblock aligned for CMA. - */ - alignment = max(alignment, (phys_addr_t)(PAGE_SIZE << pageblock_order)); - size = ALIGN(size, alignment); - /* - * Reserve memory - * Use __memblock_alloc_base() since - * memblock_alloc_base() panic()s. - */ - addr = __memblock_alloc_base(size, alignment, 0); - if (!addr) { - base_pfn = -ENOMEM; - goto err; - } else - base_pfn = PFN_DOWN(addr); - - /* - * Each reserved area must be initialised later, when more kernel - * subsystems (like slab allocator) are available. - */ - cma->base_pfn = base_pfn; - cma->count = size >> PAGE_SHIFT; - pr_info("CMA: reserved %ld MiB\n", (unsigned long)size / SZ_1M); - return 0; -err: - pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); - return base_pfn; -} - -/** - * kvm_alloc_cma() - allocate pages from contiguous area - * @nr_pages: Requested number of pages. - * @align_pages: Requested alignment in number of pages - * - * This function allocates memory buffer for hash pagetable. - */ -struct page *kvm_alloc_cma(unsigned long nr_pages, unsigned long align_pages) -{ - int ret; - struct page *page = NULL; - struct kvm_cma *cma = &kvm_cma_area; - unsigned long chunk_count, nr_chunk; - unsigned long mask, pfn, pageno, start = 0; - - - if (!cma || !cma->count) - return NULL; - - pr_debug("%s(cma %p, count %lu, align pages %lu)\n", __func__, - (void *)cma, nr_pages, align_pages); - - if (!nr_pages) - return NULL; - /* - * align mask with chunk size. The bit tracks pages in chunk size - */ - VM_BUG_ON(!is_power_of_2(align_pages)); - mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1; - BUILD_BUG_ON(PAGE_SHIFT > KVM_CMA_CHUNK_ORDER); - - chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - - mutex_lock(&kvm_cma_mutex); - for (;;) { - pageno = bitmap_find_next_zero_area(cma->bitmap, chunk_count, - start, nr_chunk, mask); - if (pageno >= chunk_count) - break; - - pfn = cma->base_pfn + (pageno << (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)); - ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA); - if (ret == 0) { - bitmap_set(cma->bitmap, pageno, nr_chunk); - page = pfn_to_page(pfn); - memset(pfn_to_kaddr(pfn), 0, nr_pages << PAGE_SHIFT); - break; - } else if (ret != -EBUSY) { - break; - } - pr_debug("%s(): memory range at %p is busy, retrying\n", - __func__, pfn_to_page(pfn)); - /* try again with a bit different memory target */ - start = pageno + mask + 1; - } - mutex_unlock(&kvm_cma_mutex); - pr_debug("%s(): returned %p\n", __func__, page); - return page; -} - -/** - * kvm_release_cma() - release allocated pages for hash pagetable - * @pages: Allocated pages. - * @nr_pages: Number of allocated pages. - * - * This function releases memory allocated by kvm_alloc_cma(). - * It returns false when provided pages do not belong to contiguous area and - * true otherwise. - */ -bool kvm_release_cma(struct page *pages, unsigned long nr_pages) -{ - unsigned long pfn; - unsigned long nr_chunk; - struct kvm_cma *cma = &kvm_cma_area; - - if (!cma || !pages) - return false; - - pr_debug("%s(page %p count %lu)\n", __func__, (void *)pages, nr_pages); - - pfn = page_to_pfn(pages); - - if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) - return false; - - VM_BUG_ON(pfn + nr_pages > cma->base_pfn + cma->count); - nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - - mutex_lock(&kvm_cma_mutex); - bitmap_clear(cma->bitmap, - (pfn - cma->base_pfn) >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT), - nr_chunk); - free_contig_range(pfn, nr_pages); - mutex_unlock(&kvm_cma_mutex); - - return true; -} - -static int __init kvm_cma_activate_area(unsigned long base_pfn, - unsigned long count) -{ - unsigned long pfn = base_pfn; - unsigned i = count >> pageblock_order; - struct zone *zone; - - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - do { - unsigned j; - base_pfn = pfn; - for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); - /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. - */ - if (page_zone(pfn_to_page(pfn)) != zone) - return -EINVAL; - } - init_cma_reserved_pageblock(pfn_to_page(base_pfn)); - } while (--i); - return 0; -} - -static int __init kvm_cma_init_reserved_areas(void) -{ - int bitmap_size, ret; - unsigned long chunk_count; - struct kvm_cma *cma = &kvm_cma_area; - - pr_debug("%s()\n", __func__); - if (!cma->count) - return 0; - chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - bitmap_size = BITS_TO_LONGS(chunk_count) * sizeof(long); - cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!cma->bitmap) - return -ENOMEM; - - ret = kvm_cma_activate_area(cma->base_pfn, cma->count); - if (ret) - goto error; - return 0; - -error: - kfree(cma->bitmap); - return ret; -} -core_initcall(kvm_cma_init_reserved_areas); diff --git a/arch/powerpc/kvm/book3s_hv_cma.h b/arch/powerpc/kvm/book3s_hv_cma.h deleted file mode 100644 index 655144f..0000000 --- a/arch/powerpc/kvm/book3s_hv_cma.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA - * for DMA mapping framework - * - * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License or (at your optional) any later version of the license. - * - */ - -#ifndef __POWERPC_KVM_CMA_ALLOC_H__ -#define __POWERPC_KVM_CMA_ALLOC_H__ -/* - * Both RMA and Hash page allocation will be multiple of 256K. - */ -#define KVM_CMA_CHUNK_ORDER 18 - -extern struct page *kvm_alloc_cma(unsigned long nr_pages, - unsigned long align_pages); -extern bool kvm_release_cma(struct page *pages, unsigned long nr_pages); -extern long kvm_cma_declare_contiguous(phys_addr_t size, - phys_addr_t alignment) __init; -#endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 2c8e90f..e0f7a18 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -128,7 +128,8 @@ int arch_add_memory(int nid, u64 start, u64 size) return -EINVAL; /* this should work for most non-highmem platforms */ - zone = pgdata->node_zones; + zone = pgdata->node_zones + + zone_for_memory(nid, start, size, 0); return __add_pages(nid, zone, start_pfn, nr_pages); } diff --git a/arch/score/include/uapi/asm/ptrace.h b/arch/score/include/uapi/asm/ptrace.h index f59771a..5c5e794 100644 --- a/arch/score/include/uapi/asm/ptrace.h +++ b/arch/score/include/uapi/asm/ptrace.h @@ -4,17 +4,6 @@ #define PTRACE_GETREGS 12 #define PTRACE_SETREGS 13 -#define PC 32 -#define CONDITION 33 -#define ECR 34 -#define EMA 35 -#define CEH 36 -#define CEL 37 -#define COUNTER 38 -#define LDCR 39 -#define STCR 40 -#define PSR 41 - #define SINGLESTEP16_INSN 0x7006 #define SINGLESTEP32_INSN 0x840C8000 #define BREAKPOINT16_INSN 0x7002 /* work on SPG300 */ diff --git a/arch/sh/drivers/dma/Kconfig b/arch/sh/drivers/dma/Kconfig index cfd5b90..78bc97b 100644 --- a/arch/sh/drivers/dma/Kconfig +++ b/arch/sh/drivers/dma/Kconfig @@ -12,9 +12,8 @@ config SH_DMA_IRQ_MULTI default y if CPU_SUBTYPE_SH7750 || CPU_SUBTYPE_SH7751 || \ CPU_SUBTYPE_SH7750S || CPU_SUBTYPE_SH7750R || \ CPU_SUBTYPE_SH7751R || CPU_SUBTYPE_SH7091 || \ - CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7764 || \ - CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \ - CPU_SUBTYPE_SH7760 + CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \ + CPU_SUBTYPE_SH7785 || CPU_SUBTYPE_SH7760 config SH_DMA_API depends on SH_DMA diff --git a/arch/sh/include/asm/io_noioport.h b/arch/sh/include/asm/io_noioport.h index 4d48f14..c727e6d 100644 --- a/arch/sh/include/asm/io_noioport.h +++ b/arch/sh/include/asm/io_noioport.h @@ -34,6 +34,17 @@ static inline void outl(unsigned int x, unsigned long port) BUG(); } +static inline void __iomem *ioport_map(unsigned long port, unsigned int size) +{ + BUG(); + return NULL; +} + +static inline void ioport_unmap(void __iomem *addr) +{ + BUG(); +} + #define inb_p(addr) inb(addr) #define inw_p(addr) inw(addr) #define inl_p(addr) inl(addr) diff --git a/arch/sh/include/cpu-sh4/cpu/dma-register.h b/arch/sh/include/cpu-sh4/cpu/dma-register.h index 02788b6..9cd81e5 100644 --- a/arch/sh/include/cpu-sh4/cpu/dma-register.h +++ b/arch/sh/include/cpu-sh4/cpu/dma-register.h @@ -32,7 +32,6 @@ #define CHCR_TS_HIGH_SHIFT (20 - 2) /* 2 bits for shifted low TS */ #elif defined(CONFIG_CPU_SUBTYPE_SH7757) || \ defined(CONFIG_CPU_SUBTYPE_SH7763) || \ - defined(CONFIG_CPU_SUBTYPE_SH7764) || \ defined(CONFIG_CPU_SUBTYPE_SH7780) || \ defined(CONFIG_CPU_SUBTYPE_SH7785) #define CHCR_TS_LOW_MASK 0x00000018 diff --git a/arch/sh/include/cpu-sh4a/cpu/dma.h b/arch/sh/include/cpu-sh4a/cpu/dma.h index 89afb65..8ceccce 100644 --- a/arch/sh/include/cpu-sh4a/cpu/dma.h +++ b/arch/sh/include/cpu-sh4a/cpu/dma.h @@ -14,8 +14,7 @@ #define DMTE4_IRQ evt2irq(0xb80) #define DMAE0_IRQ evt2irq(0xbc0) /* DMA Error IRQ*/ #define SH_DMAC_BASE0 0xFE008020 -#elif defined(CONFIG_CPU_SUBTYPE_SH7763) || \ - defined(CONFIG_CPU_SUBTYPE_SH7764) +#elif defined(CONFIG_CPU_SUBTYPE_SH7763) #define DMTE0_IRQ evt2irq(0x640) #define DMTE4_IRQ evt2irq(0x780) #define DMAE0_IRQ evt2irq(0x6c0) diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c index f579dd5..c187b95 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c @@ -307,7 +307,7 @@ static struct clk_lookup lookups[] = { CLKDEV_ICK_ID("fck", "sh-tmu.0", &mstp_clks[HWBLK_TMU0]), CLKDEV_ICK_ID("fck", "sh-tmu.1", &mstp_clks[HWBLK_TMU1]), - CLKDEV_ICK_ID("fck", "sh-cmt-16.0", &mstp_clks[HWBLK_CMT]), + CLKDEV_ICK_ID("fck", "sh-cmt-32.0", &mstp_clks[HWBLK_CMT]), CLKDEV_DEV_ID("sh-wdt.0", &mstp_clks[HWBLK_RWDT]), CLKDEV_DEV_ID("sh-dma-engine.1", &mstp_clks[HWBLK_DMAC1]), @@ -332,6 +332,8 @@ static struct clk_lookup lookups[] = { CLKDEV_CON_ID("tsif0", &mstp_clks[HWBLK_TSIF]), CLKDEV_DEV_ID("renesas_usbhs.1", &mstp_clks[HWBLK_USB1]), CLKDEV_DEV_ID("renesas_usbhs.0", &mstp_clks[HWBLK_USB0]), + CLKDEV_CON_ID("usb1", &mstp_clks[HWBLK_USB1]), + CLKDEV_CON_ID("usb0", &mstp_clks[HWBLK_USB0]), CLKDEV_CON_ID("2dg0", &mstp_clks[HWBLK_2DG]), CLKDEV_DEV_ID("sh_mobile_sdhi.0", &mstp_clks[HWBLK_SDHI0]), CLKDEV_DEV_ID("sh_mobile_sdhi.1", &mstp_clks[HWBLK_SDHI1]), diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index 552c8fc..d6d0a98 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -80,10 +80,8 @@ static int __init rtc_generic_init(void) return -ENODEV; pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - return 0; + return PTR_ERR_OR_ZERO(pdev); } module_init(rtc_generic_init); diff --git a/arch/sh/mm/asids-debugfs.c b/arch/sh/mm/asids-debugfs.c index 74c03ec..ecfc6b0 100644 --- a/arch/sh/mm/asids-debugfs.c +++ b/arch/sh/mm/asids-debugfs.c @@ -67,10 +67,8 @@ static int __init asids_debugfs_init(void) NULL, &asids_debugfs_fops); if (!asids_dentry) return -ENOMEM; - if (IS_ERR(asids_dentry)) - return PTR_ERR(asids_dentry); - return 0; + return PTR_ERR_OR_ZERO(asids_dentry); } module_init(asids_debugfs_init); diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 2d089fe..2790b6a 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -495,8 +495,9 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdat = NODE_DATA(nid); /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, pgdat->node_zones + ZONE_NORMAL, - start_pfn, nr_pages); + ret = __add_pages(nid, pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL), + start_pfn, nr_pages); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c index 4918d91..d19b13e 100644 --- a/arch/tile/kernel/module.c +++ b/arch/tile/kernel/module.c @@ -58,7 +58,7 @@ void *module_alloc(unsigned long size) area->nr_pages = npages; area->pages = pages; - if (map_vm_area(area, prot_rwx, &pages)) { + if (map_vm_area(area, prot_rwx, pages)) { vunmap(area->addr); goto error; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 1dbade8..a241946 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1218,7 +1218,8 @@ good_area: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault: + * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if + * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ fault = handle_mm_fault(mm, vma, address, flags); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index e395048..7d05565 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -825,7 +825,8 @@ void __init mem_init(void) int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM; + struct zone *zone = pgdata->node_zones + + zone_for_memory(nid, start, size, ZONE_HIGHMEM); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index df1a992..5621c47 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -691,7 +691,8 @@ static void update_end_of_memory_vars(u64 start, u64 size) int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + ZONE_NORMAL; + struct zone *zone = pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig index e65d400..e1b9278 100644 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@ -16,6 +16,7 @@ menuconfig ATA depends on BLOCK depends on !(M32R || M68K || S390) || BROKEN select SCSI + select GLOB ---help--- If you want to use an ATA hard disk, ATA tape drive, ATA CD-ROM or any other ATA device under Linux, say Y and make sure that you know diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 677c0c1..dbdc5d3 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -59,6 +59,7 @@ #include <linux/async.h> #include <linux/log2.h> #include <linux/slab.h> +#include <linux/glob.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_host.h> @@ -4250,73 +4251,6 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { } }; -/** - * glob_match - match a text string against a glob-style pattern - * @text: the string to be examined - * @pattern: the glob-style pattern to be matched against - * - * Either/both of text and pattern can be empty strings. - * - * Match text against a glob-style pattern, with wildcards and simple sets: - * - * ? matches any single character. - * * matches any run of characters. - * [xyz] matches a single character from the set: x, y, or z. - * [a-d] matches a single character from the range: a, b, c, or d. - * [a-d0-9] matches a single character from either range. - * - * The special characters ?, [, -, or *, can be matched using a set, eg. [*] - * Behaviour with malformed patterns is undefined, though generally reasonable. - * - * Sample patterns: "SD1?", "SD1[0-5]", "*R0", "SD*1?[012]*xx" - * - * This function uses one level of recursion per '*' in pattern. - * Since it calls _nothing_ else, and has _no_ explicit local variables, - * this will not cause stack problems for any reasonable use here. - * - * RETURNS: - * 0 on match, 1 otherwise. - */ -static int glob_match (const char *text, const char *pattern) -{ - do { - /* Match single character or a '?' wildcard */ - if (*text == *pattern || *pattern == '?') { - if (!*pattern++) - return 0; /* End of both strings: match */ - } else { - /* Match single char against a '[' bracketed ']' pattern set */ - if (!*text || *pattern != '[') - break; /* Not a pattern set */ - while (*++pattern && *pattern != ']' && *text != *pattern) { - if (*pattern == '-' && *(pattern - 1) != '[') - if (*text > *(pattern - 1) && *text < *(pattern + 1)) { - ++pattern; - break; - } - } - if (!*pattern || *pattern == ']') - return 1; /* No match */ - while (*pattern && *pattern++ != ']'); - } - } while (*++text && *pattern); - - /* Match any run of chars against a '*' wildcard */ - if (*pattern == '*') { - if (!*++pattern) - return 0; /* Match: avoid recursion at end of pattern */ - /* Loop to handle additional pattern chars after the wildcard */ - while (*text) { - if (glob_match(text, pattern) == 0) - return 0; /* Remainder matched */ - ++text; /* Absorb (match) this char and try again */ - } - } - if (!*text && !*pattern) - return 0; /* End of both strings: match */ - return 1; /* No match */ -} - static unsigned long ata_dev_blacklisted(const struct ata_device *dev) { unsigned char model_num[ATA_ID_PROD_LEN + 1]; @@ -4327,10 +4261,10 @@ static unsigned long ata_dev_blacklisted(const struct ata_device *dev) ata_id_c_string(dev->id, model_rev, ATA_ID_FW_REV, sizeof(model_rev)); while (ad->model_num) { - if (!glob_match(model_num, ad->model_num)) { + if (glob_match(model_num, ad->model_num)) { if (ad->model_rev == NULL) return ad->horkage; - if (!glob_match(model_rev, ad->model_rev)) + if (glob_match(model_rev, ad->model_rev)) return ad->horkage; } ad++; diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 88500fe..4e7f0ff 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -289,16 +289,6 @@ config CMA_ALIGNMENT If unsure, leave the default value "8". -config CMA_AREAS - int "Maximum count of the CMA device-private areas" - default 7 - help - CMA allows to create CMA areas for particular devices. This parameter - sets the maximum number of such device private CMA areas in the - system. - - If unsure, leave the default value "7". - endif endmenu diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 6467c91..6606abd 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -24,23 +24,9 @@ #include <linux/memblock.h> #include <linux/err.h> -#include <linux/mm.h> -#include <linux/mutex.h> -#include <linux/page-isolation.h> #include <linux/sizes.h> -#include <linux/slab.h> -#include <linux/swap.h> -#include <linux/mm_types.h> #include <linux/dma-contiguous.h> - -struct cma { - unsigned long base_pfn; - unsigned long count; - unsigned long *bitmap; - struct mutex lock; -}; - -struct cma *dma_contiguous_default_area; +#include <linux/cma.h> #ifdef CONFIG_CMA_SIZE_MBYTES #define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES @@ -48,6 +34,8 @@ struct cma *dma_contiguous_default_area; #define CMA_SIZE_MBYTES 0 #endif +struct cma *dma_contiguous_default_area; + /* * Default global CMA area size can be defined in kernel's .config. * This is useful mainly for distro maintainers to create a kernel @@ -154,65 +142,6 @@ void __init dma_contiguous_reserve(phys_addr_t limit) } } -static DEFINE_MUTEX(cma_mutex); - -static int __init cma_activate_area(struct cma *cma) -{ - int bitmap_size = BITS_TO_LONGS(cma->count) * sizeof(long); - unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; - unsigned i = cma->count >> pageblock_order; - struct zone *zone; - - cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - - if (!cma->bitmap) - return -ENOMEM; - - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - - do { - unsigned j; - base_pfn = pfn; - for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); - /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. - */ - if (page_zone(pfn_to_page(pfn)) != zone) - goto err; - } - init_cma_reserved_pageblock(pfn_to_page(base_pfn)); - } while (--i); - - mutex_init(&cma->lock); - return 0; - -err: - kfree(cma->bitmap); - return -EINVAL; -} - -static struct cma cma_areas[MAX_CMA_AREAS]; -static unsigned cma_area_count; - -static int __init cma_init_reserved_areas(void) -{ - int i; - - for (i = 0; i < cma_area_count; i++) { - int ret = cma_activate_area(&cma_areas[i]); - if (ret) - return ret; - } - - return 0; -} -core_initcall(cma_init_reserved_areas); - /** * dma_contiguous_reserve_area() - reserve custom contiguous area * @size: Size of the reserved area (in bytes), @@ -234,72 +163,17 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, phys_addr_t limit, struct cma **res_cma, bool fixed) { - struct cma *cma = &cma_areas[cma_area_count]; - phys_addr_t alignment; - int ret = 0; - - pr_debug("%s(size %lx, base %08lx, limit %08lx)\n", __func__, - (unsigned long)size, (unsigned long)base, - (unsigned long)limit); - - /* Sanity checks */ - if (cma_area_count == ARRAY_SIZE(cma_areas)) { - pr_err("Not enough slots for CMA reserved regions!\n"); - return -ENOSPC; - } - - if (!size) - return -EINVAL; - - /* Sanitise input arguments */ - alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); - base = ALIGN(base, alignment); - size = ALIGN(size, alignment); - limit &= ~(alignment - 1); - - /* Reserve memory */ - if (base && fixed) { - if (memblock_is_region_reserved(base, size) || - memblock_reserve(base, size) < 0) { - ret = -EBUSY; - goto err; - } - } else { - phys_addr_t addr = memblock_alloc_range(size, alignment, base, - limit); - if (!addr) { - ret = -ENOMEM; - goto err; - } else { - base = addr; - } - } - - /* - * Each reserved area must be initialised later, when more kernel - * subsystems (like slab allocator) are available. - */ - cma->base_pfn = PFN_DOWN(base); - cma->count = size >> PAGE_SHIFT; - *res_cma = cma; - cma_area_count++; + int ret; - pr_info("CMA: reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, - (unsigned long)base); + ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, res_cma); + if (ret) + return ret; /* Architecture specific contiguous memory fixup. */ - dma_contiguous_early_fixup(base, size); - return 0; -err: - pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); - return ret; -} + dma_contiguous_early_fixup(cma_get_base(*res_cma), + cma_get_size(*res_cma)); -static void clear_cma_bitmap(struct cma *cma, unsigned long pfn, int count) -{ - mutex_lock(&cma->lock); - bitmap_clear(cma->bitmap, pfn - cma->base_pfn, count); - mutex_unlock(&cma->lock); + return 0; } /** @@ -316,62 +190,10 @@ static void clear_cma_bitmap(struct cma *cma, unsigned long pfn, int count) struct page *dma_alloc_from_contiguous(struct device *dev, int count, unsigned int align) { - unsigned long mask, pfn, pageno, start = 0; - struct cma *cma = dev_get_cma_area(dev); - struct page *page = NULL; - int ret; - - if (!cma || !cma->count) - return NULL; - if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma, - count, align); - - if (!count) - return NULL; - - mask = (1 << align) - 1; - - - for (;;) { - mutex_lock(&cma->lock); - pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count, - start, count, mask); - if (pageno >= cma->count) { - mutex_unlock(&cma->lock); - break; - } - bitmap_set(cma->bitmap, pageno, count); - /* - * It's safe to drop the lock here. We've marked this region for - * our exclusive use. If the migration fails we will take the - * lock again and unmark it. - */ - mutex_unlock(&cma->lock); - - pfn = cma->base_pfn + pageno; - mutex_lock(&cma_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); - mutex_unlock(&cma_mutex); - if (ret == 0) { - page = pfn_to_page(pfn); - break; - } else if (ret != -EBUSY) { - clear_cma_bitmap(cma, pfn, count); - break; - } - clear_cma_bitmap(cma, pfn, count); - pr_debug("%s(): memory range at %p is busy, retrying\n", - __func__, pfn_to_page(pfn)); - /* try again with a bit different memory target */ - start = pageno + mask + 1; - } - - pr_debug("%s(): returned %p\n", __func__, page); - return page; + return cma_alloc(dev_get_cma_area(dev), count, align); } /** @@ -387,23 +209,5 @@ struct page *dma_alloc_from_contiguous(struct device *dev, int count, bool dma_release_from_contiguous(struct device *dev, struct page *pages, int count) { - struct cma *cma = dev_get_cma_area(dev); - unsigned long pfn; - - if (!cma || !pages) - return false; - - pr_debug("%s(page %p)\n", __func__, (void *)pages); - - pfn = page_to_pfn(pages); - - if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) - return false; - - VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); - - free_contig_range(pfn, count); - clear_cma_bitmap(cma, pfn, count); - - return true; + return cma_release(dev_get_cma_area(dev), pages, count); } diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 89f752d..a2e13e2 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -284,7 +284,7 @@ static int memory_subsys_online(struct device *dev) * attribute and need to set the online_type. */ if (mem->online_type < 0) - mem->online_type = ONLINE_KEEP; + mem->online_type = MMOP_ONLINE_KEEP; ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); @@ -315,23 +315,23 @@ store_mem_state(struct device *dev, if (ret) return ret; - if (!strncmp(buf, "online_kernel", min_t(int, count, 13))) - online_type = ONLINE_KERNEL; - else if (!strncmp(buf, "online_movable", min_t(int, count, 14))) - online_type = ONLINE_MOVABLE; - else if (!strncmp(buf, "online", min_t(int, count, 6))) - online_type = ONLINE_KEEP; - else if (!strncmp(buf, "offline", min_t(int, count, 7))) - online_type = -1; + if (sysfs_streq(buf, "online_kernel")) + online_type = MMOP_ONLINE_KERNEL; + else if (sysfs_streq(buf, "online_movable")) + online_type = MMOP_ONLINE_MOVABLE; + else if (sysfs_streq(buf, "online")) + online_type = MMOP_ONLINE_KEEP; + else if (sysfs_streq(buf, "offline")) + online_type = MMOP_OFFLINE; else { ret = -EINVAL; goto err; } switch (online_type) { - case ONLINE_KERNEL: - case ONLINE_MOVABLE: - case ONLINE_KEEP: + case MMOP_ONLINE_KERNEL: + case MMOP_ONLINE_MOVABLE: + case MMOP_ONLINE_KEEP: /* * mem->online_type is not protected so there can be a * race here. However, when racing online, the first @@ -342,7 +342,7 @@ store_mem_state(struct device *dev, mem->online_type = online_type; ret = device_online(&mem->dev); break; - case -1: + case MMOP_OFFLINE: ret = device_offline(&mem->dev); break; default: @@ -406,7 +406,9 @@ memory_probe_store(struct device *dev, struct device_attribute *attr, int i, ret; unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; - phys_addr = simple_strtoull(buf, NULL, 0); + ret = kstrtoull(buf, 0, &phys_addr); + if (ret) + return ret; if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) return -EINVAL; diff --git a/drivers/base/node.c b/drivers/base/node.c index 8f7ed99..c6d3ae0 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -126,7 +126,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), nid, K(node_page_state(nid, NR_ANON_PAGES)), - nid, K(node_page_state(nid, NR_SHMEM)), + nid, K(i.sharedram), nid, node_page_state(nid, NR_KERNEL_STACK) * THREAD_SIZE / 1024, nid, K(node_page_state(nid, NR_PAGETABLE)), diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 36e54be..dfa4024 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -183,19 +183,32 @@ static ssize_t comp_algorithm_store(struct device *dev, static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - return meta->table[index].flags & BIT(flag); + return meta->table[index].value & BIT(flag); } static void zram_set_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - meta->table[index].flags |= BIT(flag); + meta->table[index].value |= BIT(flag); } static void zram_clear_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - meta->table[index].flags &= ~BIT(flag); + meta->table[index].value &= ~BIT(flag); +} + +static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +{ + return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); +} + +static void zram_set_obj_size(struct zram_meta *meta, + u32 index, size_t size) +{ + unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + + meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } static inline int is_partial_io(struct bio_vec *bvec) @@ -255,7 +268,6 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) goto free_table; } - rwlock_init(&meta->tb_lock); return meta; free_table: @@ -304,7 +316,12 @@ static void handle_zero_page(struct bio_vec *bvec) flush_dcache_page(page); } -/* NOTE: caller should hold meta->tb_lock with write-side */ + +/* + * To protect concurrent access to the same index entry, + * caller should hold this table index entry's bit_spinlock to + * indicate this index entry is accessing. + */ static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; @@ -324,11 +341,12 @@ static void zram_free_page(struct zram *zram, size_t index) zs_free(meta->mem_pool, handle); - atomic64_sub(meta->table[index].size, &zram->stats.compr_data_size); + atomic64_sub(zram_get_obj_size(meta, index), + &zram->stats.compr_data_size); atomic64_dec(&zram->stats.pages_stored); meta->table[index].handle = 0; - meta->table[index].size = 0; + zram_set_obj_size(meta, index, 0); } static int zram_decompress_page(struct zram *zram, char *mem, u32 index) @@ -337,14 +355,14 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) unsigned char *cmem; struct zram_meta *meta = zram->meta; unsigned long handle; - u16 size; + size_t size; - read_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); handle = meta->table[index].handle; - size = meta->table[index].size; + size = zram_get_obj_size(meta, index); if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); clear_page(mem); return 0; } @@ -355,7 +373,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) else ret = zcomp_decompress(zram->comp, cmem, size, mem); zs_unmap_object(meta->mem_pool, handle); - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret)) { @@ -376,14 +394,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, struct zram_meta *meta = zram->meta; page = bvec->bv_page; - read_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); if (unlikely(!meta->table[index].handle) || zram_test_flag(meta, index, ZRAM_ZERO)) { - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); handle_zero_page(bvec); return 0; } - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); if (is_partial_io(bvec)) /* Use a temporary buffer to decompress the page */ @@ -461,10 +479,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, if (page_zero_filled(uncmem)) { kunmap_atomic(user_mem); /* Free memory associated with this sector now. */ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); zram_set_flag(meta, index, ZRAM_ZERO); - write_unlock(&zram->meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); atomic64_inc(&zram->stats.zero_pages); ret = 0; @@ -514,12 +532,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, * Free memory associated with this sector * before overwriting unused sectors. */ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); meta->table[index].handle = handle; - meta->table[index].size = clen; - write_unlock(&zram->meta->tb_lock); + zram_set_obj_size(meta, index, clen); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Update stats */ atomic64_add(clen, &zram->stats.compr_data_size); @@ -560,6 +578,7 @@ static void zram_bio_discard(struct zram *zram, u32 index, int offset, struct bio *bio) { size_t n = bio->bi_iter.bi_size; + struct zram_meta *meta = zram->meta; /* * zram manages data in physical block size units. Because logical block @@ -580,13 +599,9 @@ static void zram_bio_discard(struct zram *zram, u32 index, } while (n >= PAGE_SIZE) { - /* - * Discard request can be large so the lock hold times could be - * lengthy. So take the lock once per page. - */ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - write_unlock(&zram->meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); index++; n -= PAGE_SIZE; } @@ -821,9 +836,9 @@ static void zram_slot_free_notify(struct block_device *bdev, zram = bdev->bd_disk->private_data; meta = zram->meta; - write_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - write_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); atomic64_inc(&zram->stats.notify_free); } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 7f21c14..5b0afde 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -43,7 +43,6 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /*-- End of configurable params */ #define SECTOR_SHIFT 9 -#define SECTOR_SIZE (1 << SECTOR_SHIFT) #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) #define ZRAM_LOGICAL_BLOCK_SHIFT 12 @@ -51,10 +50,24 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; #define ZRAM_SECTOR_PER_LOGICAL_BLOCK \ (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT)) -/* Flags for zram pages (table[page_no].flags) */ + +/* + * The lower ZRAM_FLAG_SHIFT bits of table.value is for + * object size (excluding header), the higher bits is for + * zram_pageflags. + * + * zram is mainly used for memory efficiency so we want to keep memory + * footprint small so we can squeeze size and flags into a field. + * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), + * the higher bits is for zram_pageflags. + */ +#define ZRAM_FLAG_SHIFT 24 + +/* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO, + ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, + ZRAM_ACCESS, /* page in now accessed */ __NR_ZRAM_PAGEFLAGS, }; @@ -62,11 +75,10 @@ enum zram_pageflags { /*-- Data structures */ /* Allocated for each disk page */ -struct table { +struct zram_table_entry { unsigned long handle; - u16 size; /* object size (excluding header) */ - u8 flags; -} __aligned(4); + unsigned long value; +}; struct zram_stats { atomic64_t compr_data_size; /* compressed size of pages stored */ @@ -81,8 +93,7 @@ struct zram_stats { }; struct zram_meta { - rwlock_t tb_lock; /* protect table */ - struct table *table; + struct zram_table_entry *table; struct zs_pool *mem_pool; }; diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 17cf96c..79f18e6 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -286,7 +286,11 @@ int __meminit firmware_map_add_hotplug(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; - entry = firmware_map_find_entry_bootmem(start, end, type); + entry = firmware_map_find_entry(start, end - 1, type); + if (entry) + return 0; + + entry = firmware_map_find_entry_bootmem(start, end - 1, type); if (!entry) { entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); if (!entry) diff --git a/drivers/gpu/drm/drm_hashtab.c b/drivers/gpu/drm/drm_hashtab.c index 7e4bae7..c3b80fd 100644 --- a/drivers/gpu/drm/drm_hashtab.c +++ b/drivers/gpu/drm/drm_hashtab.c @@ -125,7 +125,7 @@ int drm_ht_insert_item(struct drm_open_hash *ht, struct drm_hash_item *item) parent = &entry->head; } if (parent) { - hlist_add_after_rcu(parent, &item->head); + hlist_add_behind_rcu(&item->head, parent); } else { hlist_add_head_rcu(&item->head, h_list); } diff --git a/drivers/hwmon/asus_atk0110.c b/drivers/hwmon/asus_atk0110.c index ae208f6..cccef87 100644 --- a/drivers/hwmon/asus_atk0110.c +++ b/drivers/hwmon/asus_atk0110.c @@ -688,7 +688,7 @@ static int atk_debugfs_gitm_get(void *p, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(atk_debugfs_gitm, atk_debugfs_gitm_get, NULL, - "0x%08llx\n") + "0x%08llx\n"); static int atk_acpi_print(char *buf, size_t sz, union acpi_object *obj) { diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 0bf1e4e..6590558 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -42,7 +42,6 @@ DEFINE_MUTEX(lguest_lock); static __init int map_switcher(void) { int i, err; - struct page **pagep; /* * Map the Switcher in to high memory. @@ -110,11 +109,9 @@ static __init int map_switcher(void) * This code actually sets up the pages we've allocated to appear at * switcher_addr. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our - * array of struct pages. It increments that pointer, but we don't - * care. + * array of struct pages. */ - pagep = lg_switcher_pages; - err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); + err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, lg_switcher_pages); if (err) { printk("lguest: map_vm_area failed: %i\n", err); goto free_vma; diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 681a9e8..e8ba747 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1948,7 +1948,7 @@ static int i40e_update_ethtool_fdir_entry(struct i40e_vsi *vsi, /* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &pf->fdir_filter_list); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 94a1c07..e4100b5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -2517,7 +2517,7 @@ static int ixgbe_update_ethtool_fdir_entry(struct ixgbe_adapter *adapter, /* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &adapter->fdir_filter_list); diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 02b0379..4f34dc0 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -585,7 +585,6 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; - struct page **page_array_ptr; page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; @@ -598,8 +597,7 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, } tmp_area.addr = page_addr; tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */; - page_array_ptr = page; - ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr); + ret = map_vm_area(&tmp_area, PAGE_KERNEL, page); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n", proc->pid, page_addr); diff --git a/drivers/staging/lustre/lustre/libcfs/hash.c b/drivers/staging/lustre/lustre/libcfs/hash.c index 5dde794..8ef1deb 100644 --- a/drivers/staging/lustre/lustre/libcfs/hash.c +++ b/drivers/staging/lustre/lustre/libcfs/hash.c @@ -351,7 +351,7 @@ cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, cfs_hash_dhead_t, dh_head); if (dh->dh_tail != NULL) /* not empty */ - hlist_add_after(dh->dh_tail, hnode); + hlist_add_behind(hnode, dh->dh_tail); else /* empty list */ hlist_add_head(hnode, &dh->dh_head); dh->dh_tail = hnode; @@ -406,7 +406,7 @@ cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, cfs_hash_dhead_dep_t, dd_head); if (dh->dd_tail != NULL) /* not empty */ - hlist_add_after(dh->dd_tail, hnode); + hlist_add_behind(hnode, dh->dd_tail); else /* empty list */ hlist_add_head(hnode, &dh->dd_head); dh->dd_tail = hnode; diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 454b658..42bad18 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -355,7 +355,7 @@ static struct sysrq_key_op sysrq_term_op = { static void moom_callback(struct work_struct *ignored) { - out_of_memory(node_zonelist(first_online_node, GFP_KERNEL), GFP_KERNEL, + out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL, 0, NULL, true); } diff --git a/fs/fscache/main.c b/fs/fscache/main.c index a31b83c..b39d487 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write, return ret; } -struct ctl_table fscache_sysctls[] = { +static struct ctl_table fscache_sysctls[] = { { .procname = "object_max_active", .data = &fscache_object_max_active, @@ -87,7 +87,7 @@ struct ctl_table fscache_sysctls[] = { {} }; -struct ctl_table fscache_sysctls_root[] = { +static struct ctl_table fscache_sysctls_root[] = { { .procname = "fscache", .mode = 0555, diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 4814031..380d86e 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1019,11 +1019,11 @@ static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) /** * logfs_is_valid_block - check whether this block is still valid * - * @sb - superblock - * @ofs - block physical offset - * @ino - block inode number - * @bix - block index - * @level - block level + * @sb: superblock + * @ofs: block physical offset + * @ino: block inode number + * @bix: block index + * @gc_level: block level * * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will * become invalid once the journal is written. @@ -2226,10 +2226,9 @@ void btree_write_block(struct logfs_block *block) * * @inode: parent inode (ifile or directory) * @buf: object to write (inode or dentry) - * @n: object size - * @_pos: object number (file position in blocks/objects) + * @count: object size + * @bix: block index * @flags: write flags - * @lock: 0 if write lock is already taken, 1 otherwise * @shadow_tree: shadow below this inode * * FIXME: All caller of this put a 200-300 byte variable on the stack, diff --git a/fs/namespace.c b/fs/namespace.c index 182bc41..2a1447c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -798,7 +798,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); else hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index ee9cb37..30d3add 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -70,8 +70,15 @@ static int fanotify_get_response(struct fsnotify_group *group, wait_event(group->fanotify_data.access_waitq, event->response || atomic_read(&group->fanotify_data.bypass_perm)); - if (!event->response) /* bypass_perm set */ + if (!event->response) { /* bypass_perm set */ + /* + * Event was canceled because group is being destroyed. Remove + * it from group's event list because we are responsible for + * freeing the permission event. + */ + fsnotify_remove_event(group, &event->fae.fse); return 0; + } /* userspace responded, convert to something usable */ switch (event->response) { @@ -210,7 +217,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, return -ENOMEM; fsn_event = &event->fse; - ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); + ret = fsnotify_add_event(group, fsn_event, fanotify_merge); if (ret) { /* Permission events shouldn't be merged */ BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 3fdc8a3..b13992a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -66,7 +66,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, /* held the notification_mutex the whole time, so this is the * same event we peeked above */ - return fsnotify_remove_notify_event(group); + return fsnotify_remove_first_event(group); } static int create_fd(struct fsnotify_group *group, @@ -359,6 +359,11 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; + /* + * There may be still new events arriving in the notification queue + * but since userspace cannot use fanotify fd anymore, no event can + * enter or leave access_list by now. + */ spin_lock(&group->fanotify_data.access_lock); atomic_inc(&group->fanotify_data.bypass_perm); @@ -373,6 +378,13 @@ static int fanotify_release(struct inode *ignored, struct file *file) } spin_unlock(&group->fanotify_data.access_lock); + /* + * Since bypass_perm is set, newly queued events will not wait for + * access response. Wake up the already sleeping ones now. + * synchronize_srcu() in fsnotify_destroy_group() will wait for all + * processes sleeping in fanotify_handle_event() waiting for access + * response and thus also for all permission events to be freed. + */ wake_up(&group->fanotify_data.access_waitq); #endif diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 74825be..9ce0622 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -232,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list); + hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list); out: fsnotify_recalc_inode_mask_locked(inode); spin_unlock(&inode->i_lock); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 43ab1e1..0f88bc0 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -108,7 +108,7 @@ int inotify_handle_event(struct fsnotify_group *group, if (len) strcpy(event->name, file_name); - ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); + ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index cc423a3..daf7665 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -149,7 +149,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (fsnotify_notify_queue_is_empty(group)) return NULL; - event = fsnotify_peek_notify_event(group); + event = fsnotify_peek_first_event(group); pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -159,7 +159,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, /* held the notification_mutex the whole time, so this is the * same event we peeked above */ - fsnotify_remove_notify_event(group); + fsnotify_remove_first_event(group); return event; } diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 1e58402..a95d8e0 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -73,7 +73,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group, /* Overflow events are per-group and we don't want to free them */ if (!event || event->mask == FS_Q_OVERFLOW) return; - + /* If the event is still queued, we have a problem... */ + WARN_ON(!list_empty(&event->list)); group->ops->free_event(event); } @@ -83,10 +84,10 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * added to the queue, 1 if the event was merged with some other queued event, * 2 if the queue of events has overflown. */ -int fsnotify_add_notify_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct list_head *, - struct fsnotify_event *)) +int fsnotify_add_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; @@ -125,10 +126,25 @@ queue: } /* + * Remove @event from group's notification queue. It is the responsibility of + * the caller to destroy the event. + */ +void fsnotify_remove_event(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + mutex_lock(&group->notification_mutex); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + group->q_len--; + } + mutex_unlock(&group->notification_mutex); +} + +/* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event */ -struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { struct fsnotify_event *event; @@ -140,7 +156,7 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group struct fsnotify_event, list); /* * We need to init list head for the case of overflow event so that - * check in fsnotify_add_notify_events() works + * check in fsnotify_add_event() works */ list_del_init(&event->list); group->q_len--; @@ -149,9 +165,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group } /* - * This will not remove the event, that must be done with fsnotify_remove_notify_event() + * This will not remove the event, that must be done with + * fsnotify_remove_first_event() */ -struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { BUG_ON(!mutex_is_locked(&group->notification_mutex)); @@ -169,7 +186,7 @@ void fsnotify_flush_notify(struct fsnotify_group *group) mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { - event = fsnotify_remove_notify_event(group); + event = fsnotify_remove_first_event(group); fsnotify_destroy_event(group, event); } mutex_unlock(&group->notification_mutex); diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 68ca5a8..ac851e8 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -191,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list); + hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list); out: fsnotify_recalc_vfsmount_mask_locked(mnt); spin_unlock(&mnt->mnt_root->d_lock); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 5c9e2c8..f5ec1ce 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -74,8 +74,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * ntfs_attr_extend_initialized - extend the initialized size of an attribute * @ni: ntfs inode of the attribute to extend * @new_init_size: requested new initialized size in bytes - * @cached_page: store any allocated but unused page here - * @lru_pvec: lru-buffering pagevec of the caller * * Extend the initialized size of an attribute described by the ntfs inode @ni * to @new_init_size bytes. This involves zeroing any non-sparse space between @@ -395,7 +393,6 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, * @nr_pages: number of page cache pages to obtain * @pages: array of pages in which to return the obtained page cache pages * @cached_page: allocated but as yet unused page - * @lru_pvec: lru-buffering pagevec of caller * * Obtain @nr_pages locked page cache pages from the mapping @mapping and * starting at index @index. diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9d8fcf2..a93bf98 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4961,6 +4961,15 @@ leftright: el = path_leaf_el(path); split_index = ocfs2_search_extent_list(el, cpos); + if (split_index == -1) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has an extent at cpos %u " + "which can no longer be found.\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); + ret = -EROFS; + goto out; + } goto leftright; } out: @@ -5135,7 +5144,7 @@ int ocfs2_change_extent_flag(handle_t *handle, el = path_leaf_el(left_path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(sb, "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -5491,7 +5500,7 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -5557,7 +5566,7 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu: split at cpos %u lost record.", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 39efc505..3fcf205 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1923,12 +1923,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - if (total_backoff > - msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { + if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) { status = -ERESTARTSYS; mlog(ML_NOTICE, "Timed out joining dlm domain " "%s after %u msecs\n", dlm->name, - jiffies_to_msecs(total_backoff)); + total_backoff); goto bail; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 82abf0c..3ec906e 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2405,6 +2405,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, if (res->state & DLM_LOCK_RES_MIGRATING) return 0; + /* delay migration when the lockres is in RECOCERING state */ + if (res->state & DLM_LOCK_RES_RECOVERING) + return 0; + if (res->owner != dlm->node_num) return 0; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 599eb4c..6219aaa 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -98,7 +98,7 @@ static int __ocfs2_move_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(inode->i_sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 636aab6..d81f6e2 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3109,7 +3109,7 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 1424c15..a88b2a4 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -382,7 +382,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); - si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, + si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *), GFP_KERNEL); if (!si->si_bh) { status = -ENOMEM; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 7445af0..aa1eee0 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_WRITEBACK)), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), - K(global_page_state(NR_SHMEM)), + K(i.sharedram), K(global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_SLAB_RECLAIMABLE)), diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index cfa63ee..dfc791c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -925,15 +925,30 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; - unsigned long addr; + unsigned long addr = start; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); - for (addr = start; addr < end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); - if (err) - break; + while (addr < end) { + struct vm_area_struct *vma = find_vma(walk->mm, addr); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + unsigned long vm_end; + + if (!vma) { + vm_end = end; + } else { + vm_end = min(end, vma->vm_end); + if (vma->vm_flags & VM_SOFTDIRTY) + pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); + } + + for (; addr < vm_end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, &pme, pm); + if (err) + goto out; + } } + +out: return err; } diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 62a0de6..43e7a7e 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -44,7 +44,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) pages = end_index - start_index + 1; - page = kmalloc(sizeof(void *) * pages, GFP_KERNEL); + page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); if (page == NULL) return res; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 031c8d67..5056bab 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -27,6 +27,8 @@ * the filesystem. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> @@ -448,8 +450,7 @@ static int __init init_squashfs_fs(void) return err; } - printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " - "Phillip Lougher\n"); + pr_info("version 4.0 (2009/01/31) Phillip Lougher\n"); return 0; } diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 7ad6345..e1c8d08 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -88,32 +88,32 @@ * lib/bitmap.c provides these functions: */ -extern int __bitmap_empty(const unsigned long *bitmap, int bits); -extern int __bitmap_full(const unsigned long *bitmap, int bits); +extern int __bitmap_empty(const unsigned long *bitmap, unsigned int nbits); +extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_equal(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, - int bits); + unsigned int nbits); extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, int shift, int bits); extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, int shift, int bits); extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_intersects(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_subset(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); -extern int __bitmap_weight(const unsigned long *bitmap, int bits); + const unsigned long *bitmap2, unsigned int nbits); +extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); -extern void bitmap_set(unsigned long *map, int i, int len); -extern void bitmap_clear(unsigned long *map, int start, int nr); +extern void bitmap_set(unsigned long *map, unsigned int start, int len); +extern void bitmap_clear(unsigned long *map, unsigned int start, int len); extern unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, unsigned long start, @@ -140,9 +140,9 @@ extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, int bits); extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, int sz, int bits); -extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order); -extern void bitmap_release_region(unsigned long *bitmap, int pos, int order); -extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order); +extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); +extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); +extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); @@ -188,15 +188,15 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, } static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) - return (*dst = *src1 & *src2) != 0; + return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_and(dst, src1, src2, nbits); } static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; @@ -205,7 +205,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, } static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 ^ *src2; @@ -214,24 +214,24 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, } static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) - return (*dst = *src1 & ~(*src2)) != 0; + return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_andnot(dst, src1, src2, nbits); } static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, - int nbits) + unsigned int nbits) { if (small_const_nbits(nbits)) - *dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits); + *dst = ~(*src); else __bitmap_complement(dst, src, nbits); } static inline int bitmap_equal(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); @@ -240,7 +240,7 @@ static inline int bitmap_equal(const unsigned long *src1, } static inline int bitmap_intersects(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; @@ -249,7 +249,7 @@ static inline int bitmap_intersects(const unsigned long *src1, } static inline int bitmap_subset(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); @@ -257,7 +257,7 @@ static inline int bitmap_subset(const unsigned long *src1, return __bitmap_subset(src1, src2, nbits); } -static inline int bitmap_empty(const unsigned long *src, int nbits) +static inline int bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); @@ -265,7 +265,7 @@ static inline int bitmap_empty(const unsigned long *src, int nbits) return __bitmap_empty(src, nbits); } -static inline int bitmap_full(const unsigned long *src, int nbits) +static inline int bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); @@ -273,7 +273,7 @@ static inline int bitmap_full(const unsigned long *src, int nbits) return __bitmap_full(src, nbits); } -static inline int bitmap_weight(const unsigned long *src, int nbits) +static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); @@ -284,7 +284,7 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, int n, int nbits) { if (small_const_nbits(nbits)) - *dst = *src >> n; + *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> n; else __bitmap_shift_right(dst, src, n, nbits); } diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index 0846e6b..89f67c1 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -2,7 +2,7 @@ #define _LINUX_BYTEORDER_GENERIC_H /* - * linux/byteorder_generic.h + * linux/byteorder/generic.h * Generic Byte-reordering support * * The "... p" macros, like le64_to_cpup, can be used with pointers diff --git a/include/linux/cma.h b/include/linux/cma.h new file mode 100644 index 0000000..371b930 --- /dev/null +++ b/include/linux/cma.h @@ -0,0 +1,27 @@ +#ifndef __CMA_H__ +#define __CMA_H__ + +/* + * There is always at least global CMA area and a few optional + * areas configured in kernel .config. + */ +#ifdef CONFIG_CMA_AREAS +#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS) + +#else +#define MAX_CMA_AREAS (0) + +#endif + +struct cma; + +extern phys_addr_t cma_get_base(struct cma *cma); +extern unsigned long cma_get_size(struct cma *cma); + +extern int __init cma_declare_contiguous(phys_addr_t size, + phys_addr_t base, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + bool fixed, struct cma **res_cma); +extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align); +extern bool cma_release(struct cma *cma, struct page *pages, int count); +#endif diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index 772eab5..569bbd0 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -53,18 +53,13 @@ #ifdef __KERNEL__ +#include <linux/device.h> + struct cma; struct page; -struct device; #ifdef CONFIG_DMA_CMA -/* - * There is always at least global CMA area and a few optional device - * private areas configured in kernel .config. - */ -#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS) - extern struct cma *dma_contiguous_default_area; static inline struct cma *dev_get_cma_area(struct device *dev) @@ -123,8 +118,6 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, #else -#define MAX_CMA_AREAS (0) - static inline struct cma *dev_get_cma_area(struct device *dev) { return NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2daccaf..1ab6c69 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2688,7 +2688,7 @@ static const struct file_operations __fops = { \ .read = simple_attr_read, \ .write = simple_attr_write, \ .llseek = generic_file_llseek, \ -}; +} static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index fc7718c..ca060d7 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -322,16 +322,18 @@ extern int fsnotify_fasync(int fd, struct file *file, int on); extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ -extern int fsnotify_add_notify_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct list_head *, - struct fsnotify_event *)); +extern int fsnotify_add_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)); +/* Remove passed event from groups notification queue */ +extern void fsnotify_remove_event(struct fsnotify_group *group, struct fsnotify_event *event); /* true if the group notification queue is empty */ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ -extern struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group); +extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); /* return AND dequeue the first event on the notification queue */ -extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group); +extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group); /* functions used to manipulate the marks attached to inodes */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6eb1fb3..5e7219d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -360,7 +360,7 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask); void free_pages_exact(void *virt, size_t size); /* This is different from alloc_pages_exact_node !!! */ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) diff --git a/include/linux/glob.h b/include/linux/glob.h new file mode 100644 index 0000000..861d834 --- /dev/null +++ b/include/linux/glob.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_GLOB_H +#define _LINUX_GLOB_H + +#include <linux/types.h> /* For bool */ +#include <linux/compiler.h> /* For __pure */ + +bool __pure glob_match(char const *pat, char const *str); + +#endif /* _LINUX_GLOB_H */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 7fb31da..9286a46 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -93,7 +93,7 @@ static inline int kmap_atomic_idx_push(void) #ifdef CONFIG_DEBUG_HIGHMEM WARN_ON_ONCE(in_irq() && !irqs_disabled()); - BUG_ON(idx > KM_TYPE_NR); + BUG_ON(idx >= KM_TYPE_NR); #endif return idx; } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b826239..63579cb 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -93,10 +93,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); #endif /* CONFIG_DEBUG_VM */ extern unsigned long transparent_hugepage_flags; -extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end); extern int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a23c096..6e6d338 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -87,7 +87,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); #endif extern unsigned long hugepages_treat_as_movable; -extern const unsigned long hugetlb_zero, hugetlb_infinity; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a9e2268..3dc22ab 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -493,11 +493,6 @@ static inline char *hex_byte_pack_upper(char *buf, u8 byte) return buf; } -static inline char * __deprecated pack_hex_byte(char *buf, u8 byte) -{ - return hex_byte_pack(buf, byte); -} - extern int hex_to_bin(char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count); diff --git a/include/linux/klist.h b/include/linux/klist.h index a370ce5..61e5b72 100644 --- a/include/linux/klist.h +++ b/include/linux/klist.h @@ -44,7 +44,7 @@ struct klist_node { extern void klist_add_tail(struct klist_node *n, struct klist *k); extern void klist_add_head(struct klist_node *n, struct klist *k); -extern void klist_add_after(struct klist_node *n, struct klist_node *pos); +extern void klist_add_behind(struct klist_node *n, struct klist_node *pos); extern void klist_add_before(struct klist_node *n, struct klist_node *pos); extern void klist_del(struct klist_node *n); diff --git a/include/linux/list.h b/include/linux/list.h index ef95941..cbbb96f 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -654,15 +654,15 @@ static inline void hlist_add_before(struct hlist_node *n, *(n->pprev) = n; } -static inline void hlist_add_after(struct hlist_node *n, - struct hlist_node *next) +static inline void hlist_add_behind(struct hlist_node *n, + struct hlist_node *prev) { - next->next = n->next; - n->next = next; - next->pprev = &n->next; + n->next = prev->next; + prev->next = n; + n->pprev = &prev->next; - if(next->next) - next->next->pprev = &next->next; + if (n->next) + n->next->pprev = &n->next; } /* after that we'll appear to be on some hlist and hlist_del will work */ diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b660e05..e8cc453 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -249,7 +249,7 @@ phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); /* * Set the allocation direction to bottom-up or top-down. */ -static inline void memblock_set_bottom_up(bool enable) +static inline void __init memblock_set_bottom_up(bool enable) { memblock.bottom_up = enable; } @@ -264,7 +264,7 @@ static inline bool memblock_bottom_up(void) return memblock.bottom_up; } #else -static inline void memblock_set_bottom_up(bool enable) {} +static inline void __init memblock_set_bottom_up(bool enable) {} static inline bool memblock_bottom_up(void) { return false; } #endif diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 010d125..d9524c4 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -26,11 +26,12 @@ enum { MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, }; -/* Types for control the zone type of onlined memory */ +/* Types for control the zone type of onlined and offlined memory */ enum { - ONLINE_KEEP, - ONLINE_KERNEL, - ONLINE_MOVABLE, + MMOP_OFFLINE = -1, + MMOP_ONLINE_KEEP, + MMOP_ONLINE_KERNEL, + MMOP_ONLINE_MOVABLE, }; /* @@ -258,6 +259,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {} extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); +extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default); extern int arch_add_memory(int nid, u64 start, u64 size); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index edd82a1..2f348d0 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -20,11 +20,13 @@ extern void dump_page_badflags(struct page *page, const char *reason, } while (0) #define VM_WARN_ON(cond) WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) +#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif #ifdef CONFIG_DEBUG_VIRTUAL diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index deca874..2728869 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -170,6 +170,8 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm); +extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, + struct mm_struct *mm); extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, @@ -288,6 +290,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) set_pte_at(___mm, ___address, __ptep, ___pte); \ }) +extern void mmu_notifier_call_srcu(struct rcu_head *rcu, + void (*func)(struct rcu_head *rcu)); +extern void mmu_notifier_synchronize(void); + #else /* CONFIG_MMU_NOTIFIER */ static inline void mmu_notifier_release(struct mm_struct *mm) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6cbd1b6..318df70 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -143,6 +143,7 @@ enum zone_stat_item { NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ + NR_PAGES_SCANNED, /* pages scanned since last reclaim */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ @@ -324,19 +325,12 @@ enum zone_type { #ifndef __GENERATING_BOUNDS_H struct zone { - /* Fields commonly accessed by the page allocator */ + /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long watermark[NR_WMARK]; /* - * When free pages are below this point, additional steps are taken - * when reading the number of free pages to avoid per-cpu counter - * drift allowing watermarks to be breached - */ - unsigned long percpu_drift_mark; - - /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several * GB of ram we must reserve some of the lower zone memory (otherwise we risk @@ -344,41 +338,26 @@ struct zone { * on the higher zones). This array is recalculated at runtime if the * sysctl_lowmem_reserve_ratio sysctl changes. */ - unsigned long lowmem_reserve[MAX_NR_ZONES]; - - /* - * This is a per-zone reserve of pages that should not be - * considered dirtyable memory. - */ - unsigned long dirty_balance_reserve; + long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; +#endif + /* - * zone reclaim becomes active if more unmapped pages exist. + * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on + * this zone's LRU. Maintained by the pageout code. */ - unsigned long min_unmapped_pages; - unsigned long min_slab_pages; -#endif + unsigned int inactive_ratio; + + struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; + /* - * free areas of different sizes + * This is a per-zone reserve of pages that should not be + * considered dirtyable memory. */ - spinlock_t lock; -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - /* Set to true when the PG_migrate_skip bits should be cleared */ - bool compact_blockskip_flush; - - /* pfn where compaction free scanner should start */ - unsigned long compact_cached_free_pfn; - /* pfn where async and sync compaction migration scanner should start */ - unsigned long compact_cached_migrate_pfn[2]; -#endif -#ifdef CONFIG_MEMORY_HOTPLUG - /* see spanned/present_pages for more description */ - seqlock_t span_seqlock; -#endif - struct free_area free_area[MAX_ORDER]; + unsigned long dirty_balance_reserve; #ifndef CONFIG_SPARSEMEM /* @@ -388,74 +367,14 @@ struct zone { unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ -#ifdef CONFIG_COMPACTION - /* - * On compaction failure, 1<<compact_defer_shift compactions - * are skipped before trying again. The number attempted since - * last failure is tracked with compact_considered. - */ - unsigned int compact_considered; - unsigned int compact_defer_shift; - int compact_order_failed; -#endif - - ZONE_PADDING(_pad1_) - - /* Fields commonly accessed by the page reclaim scanner */ - spinlock_t lru_lock; - struct lruvec lruvec; - - /* Evictions & activations on the inactive file list */ - atomic_long_t inactive_age; - - unsigned long pages_scanned; /* since last reclaim */ - unsigned long flags; /* zone flags, see below */ - - /* Zone statistics */ - atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; - - /* - * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on - * this zone's LRU. Maintained by the pageout code. - */ - unsigned int inactive_ratio; - - - ZONE_PADDING(_pad2_) - /* Rarely used or read-mostly fields */ - +#ifdef CONFIG_NUMA /* - * wait_table -- the array holding the hash table - * wait_table_hash_nr_entries -- the size of the hash table array - * wait_table_bits -- wait_table_size == (1 << wait_table_bits) - * - * The purpose of all these is to keep track of the people - * waiting for a page to become available and make them - * runnable again when possible. The trouble is that this - * consumes a lot of space, especially when so few things - * wait on pages at a given time. So instead of using - * per-page waitqueues, we use a waitqueue hash table. - * - * The bucket discipline is to sleep on the same queue when - * colliding and wake all in that wait queue when removing. - * When something wakes, it must check to be sure its page is - * truly available, a la thundering herd. The cost of a - * collision is great, but given the expected load of the - * table, they should be so rare as to be outweighed by the - * benefits from the saved space. - * - * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the - * primary users of these fields, and in mm/page_alloc.c - * free_area_init_core() performs the initialization of them. + * zone reclaim becomes active if more unmapped pages exist. */ - wait_queue_head_t * wait_table; - unsigned long wait_table_hash_nr_entries; - unsigned long wait_table_bits; + unsigned long min_unmapped_pages; + unsigned long min_slab_pages; +#endif /* CONFIG_NUMA */ - /* - * Discontig memory support fields. - */ - struct pglist_data *zone_pgdat; /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; @@ -500,9 +419,11 @@ struct zone { * adjust_managed_page_count() should be used instead of directly * touching zone->managed_pages and totalram_pages. */ + unsigned long managed_pages; unsigned long spanned_pages; unsigned long present_pages; - unsigned long managed_pages; + + const char *name; /* * Number of MIGRATE_RESEVE page block. To maintain for just @@ -510,10 +431,94 @@ struct zone { */ int nr_migrate_reserve_block; +#ifdef CONFIG_MEMORY_HOTPLUG + /* see spanned/present_pages for more description */ + seqlock_t span_seqlock; +#endif + /* - * rarely used fields: + * wait_table -- the array holding the hash table + * wait_table_hash_nr_entries -- the size of the hash table array + * wait_table_bits -- wait_table_size == (1 << wait_table_bits) + * + * The purpose of all these is to keep track of the people + * waiting for a page to become available and make them + * runnable again when possible. The trouble is that this + * consumes a lot of space, especially when so few things + * wait on pages at a given time. So instead of using + * per-page waitqueues, we use a waitqueue hash table. + * + * The bucket discipline is to sleep on the same queue when + * colliding and wake all in that wait queue when removing. + * When something wakes, it must check to be sure its page is + * truly available, a la thundering herd. The cost of a + * collision is great, but given the expected load of the + * table, they should be so rare as to be outweighed by the + * benefits from the saved space. + * + * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the + * primary users of these fields, and in mm/page_alloc.c + * free_area_init_core() performs the initialization of them. */ - const char *name; + wait_queue_head_t *wait_table; + unsigned long wait_table_hash_nr_entries; + unsigned long wait_table_bits; + + ZONE_PADDING(_pad1_) + + /* Write-intensive fields used from the page allocator */ + spinlock_t lock; + + /* free areas of different sizes */ + struct free_area free_area[MAX_ORDER]; + + /* zone flags, see below */ + unsigned long flags; + + ZONE_PADDING(_pad2_) + + /* Write-intensive fields used by page reclaim */ + + /* Fields commonly accessed by the page reclaim scanner */ + spinlock_t lru_lock; + struct lruvec lruvec; + + /* Evictions & activations on the inactive file list */ + atomic_long_t inactive_age; + + /* + * When free pages are below this point, additional steps are taken + * when reading the number of free pages to avoid per-cpu counter + * drift allowing watermarks to be breached + */ + unsigned long percpu_drift_mark; + +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + /* pfn where compaction free scanner should start */ + unsigned long compact_cached_free_pfn; + /* pfn where async and sync compaction migration scanner should start */ + unsigned long compact_cached_migrate_pfn[2]; +#endif + +#ifdef CONFIG_COMPACTION + /* + * On compaction failure, 1<<compact_defer_shift compactions + * are skipped before trying again. The number attempted since + * last failure is tracked with compact_considered. + */ + unsigned int compact_considered; + unsigned int compact_defer_shift; + int compact_order_failed; +#endif + +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + /* Set to true when the PG_migrate_skip bits should be cleared */ + bool compact_blockskip_flush; +#endif + + ZONE_PADDING(_pad3_) + /* Zone statistics */ + atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; } ____cacheline_internodealigned_in_smp; typedef enum { @@ -529,6 +534,7 @@ typedef enum { ZONE_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ + ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -566,6 +572,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone) return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); } +static inline int zone_is_fair_depleted(const struct zone *zone) +{ + return test_bit(ZONE_FAIR_DEPLETED, &zone->flags); +} + static inline int zone_is_oom_locked(const struct zone *zone) { return test_bit(ZONE_OOM_LOCKED, &zone->flags); @@ -872,6 +883,8 @@ static inline int zone_movable_is_highmem(void) { #if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) return movable_zone == ZONE_HIGHMEM; +#elif defined(CONFIG_HIGHMEM) + return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; #else return 0; #endif diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 58b9a02..83a6aed 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -430,7 +430,15 @@ static inline int num_node_state(enum node_states state) for_each_node_mask((__node), node_states[__state]) #define first_online_node first_node(node_states[N_ONLINE]) -#define next_online_node(nid) next_node((nid), node_states[N_ONLINE]) +#define first_memory_node first_node(node_states[N_MEMORY]) +static inline int next_online_node(int nid) +{ + return next_node(nid, node_states[N_ONLINE]); +} +static inline int next_memory_node(int nid) +{ + return next_node(nid, node_states[N_MEMORY]); +} extern int nr_node_ids; extern int nr_online_nodes; @@ -471,6 +479,7 @@ static inline int num_node_state(enum node_states state) for ( (node) = 0; (node) == 0; (node) = 1) #define first_online_node 0 +#define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1 #define nr_online_nodes 1 diff --git a/include/linux/oom.h b/include/linux/oom.h index 4cd6267..647395a 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -55,8 +55,8 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct mem_cgroup *memcg, nodemask_t *nodemask, const char *message); -extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); -extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags); +extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, int order, const nodemask_t *nodemask); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8304959..e1f5fcd 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -171,13 +171,12 @@ static inline int __TestClearPage##uname(struct page *page) \ #define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname) -#define PAGEFLAG_FALSE(uname) \ -static inline int Page##uname(const struct page *page) \ - { return 0; } - #define TESTSCFLAG(uname, lname) \ TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname) +#define TESTPAGEFLAG_FALSE(uname) \ +static inline int Page##uname(const struct page *page) { return 0; } + #define SETPAGEFLAG_NOOP(uname) \ static inline void SetPage##uname(struct page *page) { } @@ -187,12 +186,21 @@ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname) \ static inline void __ClearPage##uname(struct page *page) { } +#define TESTSETFLAG_FALSE(uname) \ +static inline int TestSetPage##uname(struct page *page) { return 0; } + #define TESTCLEARFLAG_FALSE(uname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define __TESTCLEARFLAG_FALSE(uname) \ static inline int __TestClearPage##uname(struct page *page) { return 0; } +#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ + SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) + +#define TESTSCFLAG_FALSE(uname) \ + TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) + struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) @@ -248,7 +256,6 @@ PAGEFLAG_FALSE(HighMem) PAGEFLAG(SwapCache, swapcache) #else PAGEFLAG_FALSE(SwapCache) - SETPAGEFLAG_NOOP(SwapCache) CLEARPAGEFLAG_NOOP(SwapCache) #endif PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) @@ -258,8 +265,8 @@ PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked) #else -PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked) - TESTCLEARFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) +PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) + TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e1474ae..3df8c7d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -484,6 +484,9 @@ static inline int lock_page_killable(struct page *page) /* * lock_page_or_retry - Lock the page, unless this would block and the * caller indicated that it can handle a retry. + * + * Return value and mmap_sem implications depend on flags; see + * __lock_page_or_retry(). */ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) diff --git a/include/linux/printk.h b/include/linux/printk.h index 319ff7e..0990997 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -31,7 +31,7 @@ static inline const char *printk_skip_level(const char *buffer) } /* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL +#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT /* We show everything that is MORE important than this.. */ #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8183b46f..372ad5e 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -432,9 +432,9 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, } /** - * hlist_add_after_rcu - * @prev: the existing element to add the new element after. + * hlist_add_behind_rcu * @n: the new element to add to the hash list. + * @prev: the existing element to add the new element after. * * Description: * Adds the specified element to the specified hlist @@ -449,8 +449,8 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ -static inline void hlist_add_after_rcu(struct hlist_node *prev, - struct hlist_node *n) +static inline void hlist_add_behind_rcu(struct hlist_node *n, + struct hlist_node *prev) { n->next = prev->next; n->pprev = &prev->next; diff --git a/include/linux/swap.h b/include/linux/swap.h index 4bdbee8..1eb6404 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -311,7 +311,6 @@ extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); -extern void init_page_accessed(struct page *page); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 4b8a891..b87696f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -113,7 +113,7 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, - struct page ***pages); + struct page **pages); #ifdef CONFIG_MMU extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); diff --git a/include/linux/zbud.h b/include/linux/zbud.h index 13af0d4..f9d41a6 100644 --- a/include/linux/zbud.h +++ b/include/linux/zbud.h @@ -11,7 +11,7 @@ struct zbud_ops { struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops); void zbud_destroy_pool(struct zbud_pool *pool); -int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, unsigned long *handle); void zbud_free(struct zbud_pool *pool, unsigned long handle); int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries); diff --git a/include/linux/zlib.h b/include/linux/zlib.h index 9c5a6b4..197abb2 100644 --- a/include/linux/zlib.h +++ b/include/linux/zlib.h @@ -493,64 +493,6 @@ extern int deflateInit2 (z_streamp strm, method). msg is set to null if there is no error message. deflateInit2 does not perform any compression: this will be done by deflate(). */ - -#if 0 -extern int zlib_deflateSetDictionary (z_streamp strm, - const Byte *dictionary, - uInt dictLength); -#endif -/* - Initializes the compression dictionary from the given byte sequence - without producing any compressed output. This function must be called - immediately after deflateInit, deflateInit2 or deflateReset, before any - call of deflate. The compressor and decompressor must use exactly the same - dictionary (see inflateSetDictionary). - - The dictionary should consist of strings (byte sequences) that are likely - to be encountered later in the data to be compressed, with the most commonly - used strings preferably put towards the end of the dictionary. Using a - dictionary is most useful when the data to be compressed is short and can be - predicted with good accuracy; the data can then be compressed better than - with the default empty dictionary. - - Depending on the size of the compression data structures selected by - deflateInit or deflateInit2, a part of the dictionary may in effect be - discarded, for example if the dictionary is larger than the window size in - deflate or deflate2. Thus the strings most likely to be useful should be - put at the end of the dictionary, not at the front. - - Upon return of this function, strm->adler is set to the Adler32 value - of the dictionary; the decompressor may later use this value to determine - which dictionary has been used by the compressor. (The Adler32 value - applies to the whole dictionary even if only a subset of the dictionary is - actually used by the compressor.) - - deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a - parameter is invalid (such as NULL dictionary) or the stream state is - inconsistent (for example if deflate has already been called for this stream - or if the compression method is bsort). deflateSetDictionary does not - perform any compression: this will be done by deflate(). -*/ - -#if 0 -extern int zlib_deflateCopy (z_streamp dest, z_streamp source); -#endif - -/* - Sets the destination stream as a complete copy of the source stream. - - This function can be useful when several compression strategies will be - tried, for example when there are several ways of pre-processing the input - data with a filter. The streams that will be discarded should then be freed - by calling deflateEnd. Note that deflateCopy duplicates the internal - compression state which can be quite large, so this strategy is slow and - can consume lots of memory. - - deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not - enough memory, Z_STREAM_ERROR if the source stream state was inconsistent - (such as zalloc being NULL). msg is left unchanged in both source and - destination. -*/ extern int zlib_deflateReset (z_streamp strm); /* @@ -568,27 +510,6 @@ static inline unsigned long deflateBound(unsigned long s) return s + ((s + 7) >> 3) + ((s + 63) >> 6) + 11; } -#if 0 -extern int zlib_deflateParams (z_streamp strm, int level, int strategy); -#endif -/* - Dynamically update the compression level and compression strategy. The - interpretation of level and strategy is as in deflateInit2. This can be - used to switch between compression and straight copy of the input data, or - to switch to a different kind of input data requiring a different - strategy. If the compression level is changed, the input available so far - is compressed with the old level (and may be flushed); the new level will - take effect only at the next call of deflate(). - - Before the call of deflateParams, the stream state must be set as for - a call of deflate(), since the currently available input may have to - be compressed and flushed. In particular, strm->avail_out must be non-zero. - - deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source - stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR - if strm->avail_out was zero. -*/ - /* extern int inflateInit2 (z_streamp strm, int windowBits); @@ -631,45 +552,6 @@ extern int inflateInit2 (z_streamp strm, int windowBits); and avail_out are unchanged.) */ -extern int zlib_inflateSetDictionary (z_streamp strm, - const Byte *dictionary, - uInt dictLength); -/* - Initializes the decompression dictionary from the given uncompressed byte - sequence. This function must be called immediately after a call of inflate, - if that call returned Z_NEED_DICT. The dictionary chosen by the compressor - can be determined from the adler32 value returned by that call of inflate. - The compressor and decompressor must use exactly the same dictionary (see - deflateSetDictionary). For raw inflate, this function can be called - immediately after inflateInit2() or inflateReset() and before any call of - inflate() to set the dictionary. The application must insure that the - dictionary that was used for compression is provided. - - inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a - parameter is invalid (such as NULL dictionary) or the stream state is - inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the - expected one (incorrect adler32 value). inflateSetDictionary does not - perform any decompression: this will be done by subsequent calls of - inflate(). -*/ - -#if 0 -extern int zlib_inflateSync (z_streamp strm); -#endif -/* - Skips invalid compressed data until a full flush point (see above the - description of deflate with Z_FULL_FLUSH) can be found, or until all - available input is skipped. No output is provided. - - inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR - if no more input was provided, Z_DATA_ERROR if no flush point has been found, - or Z_STREAM_ERROR if the stream structure was inconsistent. In the success - case, the application may save the current current value of total_in which - indicates where valid compressed data was found. In the error case, the - application may repeatedly call inflateSync, providing more input each time, - until success or end of the input data. -*/ - extern int zlib_inflateReset (z_streamp strm); /* This function is equivalent to inflateEnd followed by inflateInit, diff --git a/include/linux/zpool.h b/include/linux/zpool.h new file mode 100644 index 0000000..f14bd75 --- /dev/null +++ b/include/linux/zpool.h @@ -0,0 +1,106 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for the zbud and zsmalloc memory + * storage pool implementations. Typically, this is used to + * store compressed memory. + */ + +#ifndef _ZPOOL_H_ +#define _ZPOOL_H_ + +struct zpool; + +struct zpool_ops { + int (*evict)(struct zpool *pool, unsigned long handle); +}; + +/* + * Control how a handle is mapped. It will be ignored if the + * implementation does not support it. Its use is optional. + * Note that this does not refer to memory protection, it + * refers to how the memory will be copied in/out if copying + * is necessary during mapping; read-write is the safest as + * it copies the existing memory in on map, and copies the + * changed memory back out on unmap. Write-only does not copy + * in the memory and should only be used for initialization. + * If in doubt, use ZPOOL_MM_DEFAULT which is read-write. + */ +enum zpool_mapmode { + ZPOOL_MM_RW, /* normal read-write mapping */ + ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */ + ZPOOL_MM_WO, /* write-only (no copy-in at map time) */ + + ZPOOL_MM_DEFAULT = ZPOOL_MM_RW +}; + +struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops); + +char *zpool_get_type(struct zpool *pool); + +void zpool_destroy_pool(struct zpool *pool); + +int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, + unsigned long *handle); + +void zpool_free(struct zpool *pool, unsigned long handle); + +int zpool_shrink(struct zpool *pool, unsigned int pages, + unsigned int *reclaimed); + +void *zpool_map_handle(struct zpool *pool, unsigned long handle, + enum zpool_mapmode mm); + +void zpool_unmap_handle(struct zpool *pool, unsigned long handle); + +u64 zpool_get_total_size(struct zpool *pool); + + +/** + * struct zpool_driver - driver implementation for zpool + * @type: name of the driver. + * @list: entry in the list of zpool drivers. + * @create: create a new pool. + * @destroy: destroy a pool. + * @malloc: allocate mem from a pool. + * @free: free mem from a pool. + * @shrink: shrink the pool. + * @map: map a handle. + * @unmap: unmap a handle. + * @total_size: get total size of a pool. + * + * This is created by a zpool implementation and registered + * with zpool. + */ +struct zpool_driver { + char *type; + struct module *owner; + atomic_t refcount; + struct list_head list; + + void *(*create)(gfp_t gfp, struct zpool_ops *ops); + void (*destroy)(void *pool); + + int (*malloc)(void *pool, size_t size, gfp_t gfp, + unsigned long *handle); + void (*free)(void *pool, unsigned long handle); + + int (*shrink)(void *pool, unsigned int pages, + unsigned int *reclaimed); + + void *(*map)(void *pool, unsigned long handle, + enum zpool_mapmode mm); + void (*unmap)(void *pool, unsigned long handle); + + u64 (*total_size)(void *pool); +}; + +void zpool_register_driver(struct zpool_driver *driver); + +int zpool_unregister_driver(struct zpool_driver *driver); + +int zpool_evict(void *pool, unsigned long handle); + +#endif diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 4e4f2f8..dd2b546 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -17,6 +17,7 @@ {MR_MEMORY_HOTPLUG, "memory_hotplug"}, \ {MR_SYSCALL, "syscall_or_cpuset"}, \ {MR_MEMPOLICY_MBIND, "mempolicy_mbind"}, \ + {MR_NUMA_MISPLACED, "numa_misplaced"}, \ {MR_CMA, "cma"} TRACE_EVENT(mm_migrate_pages, diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h index 1c9fabd..ce0803b 100644 --- a/include/trace/events/pagemap.h +++ b/include/trace/events/pagemap.h @@ -28,12 +28,10 @@ TRACE_EVENT(mm_lru_insertion, TP_PROTO( struct page *page, - unsigned long pfn, - int lru, - unsigned long flags + int lru ), - TP_ARGS(page, pfn, lru, flags), + TP_ARGS(page, lru), TP_STRUCT__entry( __field(struct page *, page ) @@ -44,9 +42,9 @@ TRACE_EVENT(mm_lru_insertion, TP_fast_assign( __entry->page = page; - __entry->pfn = pfn; + __entry->pfn = page_to_pfn(page); __entry->lru = lru; - __entry->flags = flags; + __entry->flags = trace_pagemap_flags(page); ), /* Flag format is based on page-types.c formatting for pagemap */ @@ -64,9 +62,9 @@ TRACE_EVENT(mm_lru_insertion, TRACE_EVENT(mm_lru_activate, - TP_PROTO(struct page *page, unsigned long pfn), + TP_PROTO(struct page *page), - TP_ARGS(page, pfn), + TP_ARGS(page), TP_STRUCT__entry( __field(struct page *, page ) @@ -75,7 +73,7 @@ TRACE_EVENT(mm_lru_activate, TP_fast_assign( __entry->page = page; - __entry->pfn = pfn; + __entry->pfn = page_to_pfn(page); ), /* Flag format is based on page-types.c formatting for pagemap */ diff --git a/init/Kconfig b/init/Kconfig index 41066e4..a291b7e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -807,15 +807,53 @@ config LOG_BUF_SHIFT range 12 21 default 17 help - Select kernel log buffer size as a power of 2. + Select the minimal kernel log buffer size as a power of 2. + The final size is affected by LOG_CPU_MAX_BUF_SHIFT config + parameter, see below. Any higher size also might be forced + by "log_buf_len" boot parameter. + Examples: - 17 => 128 KB + 17 => 128 KB 16 => 64 KB - 15 => 32 KB - 14 => 16 KB + 15 => 32 KB + 14 => 16 KB 13 => 8 KB 12 => 4 KB +config LOG_CPU_MAX_BUF_SHIFT + int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" + range 0 21 + default 12 if !BASE_SMALL + default 0 if BASE_SMALL + help + This option allows to increase the default ring buffer size + according to the number of CPUs. The value defines the contribution + of each CPU as a power of 2. The used space is typically only few + lines however it might be much more when problems are reported, + e.g. backtraces. + + The increased size means that a new buffer has to be allocated and + the original static one is unused. It makes sense only on systems + with more CPUs. Therefore this value is used only when the sum of + contributions is greater than the half of the default kernel ring + buffer as defined by LOG_BUF_SHIFT. The default values are set + so that more than 64 CPUs are needed to trigger the allocation. + + Also this option is ignored when "log_buf_len" kernel parameter is + used as it forces an exact (power of two) size of the ring buffer. + + The number of possible CPUs is used for this computation ignoring + hotplugging making the compuation optimal for the the worst case + scenerio while allowing a simple algorithm to be used from bootup. + + Examples shift values and their meaning: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + # # Architectures with an unreliable sched_clock() should select this: # diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c..c447cd9 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count) if (unlikely(!entry)) return NULL; - fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); if (unlikely(!fields)) { kfree(entry); return NULL; @@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) { - __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); + __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668..88c6b3e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -455,6 +455,7 @@ static void exit_mm(struct task_struct * tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); + clear_thread_flag(TIF_MEMDIE); } /* diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839d..de1a6bb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -45,6 +45,7 @@ #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/utsname.h> +#include <linux/ctype.h> #include <asm/uaccess.h> @@ -56,7 +57,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; @@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip) * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held + * hold it and are racing, but it helps tracking those weird code + * paths in the console code where we end up in places I want + * locked without the console sempahore held). */ static int console_locked, console_suspended; @@ -146,8 +147,8 @@ static int console_may_schedule; * the overall length of the record. * * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these both entries are maintained when messages - * are stored.. + * sequence numbers of these entries are maintained when messages are + * stored. * * If the heads indicate available messages, the length in the header * tells the start next message. A length == 0 for the next message @@ -257,7 +258,7 @@ static u64 clear_seq; static u32 clear_idx; #define PREFIX_MAX 32 -#define LOG_LINE_MAX 1024 - PREFIX_MAX +#define LOG_LINE_MAX (1024 - PREFIX_MAX) /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -266,6 +267,7 @@ static u32 clear_idx; #define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -344,7 +346,7 @@ static int log_make_free_space(u32 msg_size) while (log_first_seq < log_next_seq) { if (logbuf_has_space(msg_size, false)) return 0; - /* drop old messages until we have enough continuous space */ + /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } @@ -453,11 +455,7 @@ static int log_store(int facility, int level, return msg->text_len; } -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif +int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { @@ -828,34 +826,74 @@ void log_buf_kexec_setup(void) /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) +/* we practice scaling the ring buffer by powers of 2 */ +static void __init log_buf_len_update(unsigned size) { - unsigned size = memparse(str, &str); - if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = size; +} + +/* save requested log_buf_len since it's too early to process it */ +static int __init log_buf_len_setup(char *str) +{ + unsigned size = memparse(str, &str); + + log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); +static void __init log_buf_add_cpu(void) +{ + unsigned int cpu_extra; + + /* + * archs should set up cpu_possible_bits properly with + * set_cpu_possible() after setup_arch() but just in + * case lets ensure this is valid. + */ + if (num_possible_cpus() == 1) + return; + + cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; + + /* by default this will only continue through for large > 64 CPUs */ + if (cpu_extra <= __LOG_BUF_LEN / 2) + return; + + pr_info("log_buf_len individual max cpu contribution: %d bytes\n", + __LOG_CPU_MAX_BUF_LEN); + pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", + cpu_extra); + pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); + + log_buf_len_update(cpu_extra + __LOG_BUF_LEN); +} + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; int free; + if (log_buf != __log_buf) + return; + + if (!early && !new_log_buf_len) + log_buf_add_cpu(); + if (!new_log_buf_len) return; if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); + memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + LOG_ALIGN); } if (unlikely(!new_log_buf)) { @@ -872,7 +910,7 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); raw_spin_unlock_irqrestore(&logbuf_lock, flags); - pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", free, (free * 100) / __LOG_BUF_LEN); } @@ -881,7 +919,7 @@ static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { - ignore_loglevel = 1; + ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; @@ -947,11 +985,7 @@ static inline void boot_delay_msec(int level) } #endif -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time; -#endif +static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_time(u64 ts, char *buf) @@ -1310,7 +1344,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_idx - syslog_idx; + error = log_next_seq - syslog_seq; } else { u64 seq = syslog_seq; u32 idx = syslog_idx; @@ -1416,10 +1450,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1432,8 +1465,10 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(void) { + unsigned int cpu = smp_processor_id(); + if (!console_trylock()) return 0; /* @@ -1476,7 +1511,7 @@ static struct cont { struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ - u8 facility; /* log level of first message */ + u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; @@ -1608,7 +1643,8 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - goto out_restore_irqs; + local_irq_restore(flags); + return 0; } zap_locks(); } @@ -1716,21 +1752,30 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { + lockdep_off(); + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); + /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk()) console_unlock(); + preempt_enable(); + lockdep_on(); } - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -1802,7 +1847,7 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 -#define LOG_LINE_MAX 0 + static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; @@ -1881,11 +1926,12 @@ static int __add_preferred_console(char *name, int idx, char *options, return 0; } /* - * Set up a list of consoles. Called from init/main.c + * Set up a console. Called via do_early_param() in init/main.c + * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ char *s, *options, *brl_options = NULL; int idx; @@ -1902,7 +1948,8 @@ static int __init console_setup(char *str) strncpy(buf, str, sizeof(buf) - 1); } buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) + options = strchr(str, ','); + if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) @@ -1911,7 +1958,7 @@ static int __init console_setup(char *str) strcpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') + if (isdigit(*s) || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; @@ -1950,7 +1997,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha i++, c++) if (strcmp(c->name, name) == 0 && c->index == idx) { strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; c->options = options; c->index = idx_new; return i; @@ -1959,12 +2005,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -bool console_suspend_enabled = 1; +bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { - console_suspend_enabled = 0; + console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); @@ -2045,8 +2091,8 @@ EXPORT_SYMBOL(console_lock); /** * console_trylock - try to lock the console system for exclusive use. * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. + * Try to acquire a lock which guarantees that the caller has exclusive + * access to the console system and the console_drivers list. * * returns 1 on success, and 0 on failure to acquire the lock. */ @@ -2618,14 +2664,13 @@ EXPORT_SYMBOL(__printk_ratelimit); bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { - if (*caller_jiffies == 0 - || !time_in_range(jiffies, *caller_jiffies, - *caller_jiffies - + msecs_to_jiffies(interval_msecs))) { - *caller_jiffies = jiffies; - return true; - } - return false; + unsigned long elapsed = jiffies - *caller_jiffies; + + if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) + return false; + + *caller_jiffies = jiffies; + return true; } EXPORT_SYMBOL(printk_timed_ratelimit); diff --git a/kernel/smp.c b/kernel/smp.c index 487653b..aff8aa1 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -670,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), if (cond_func(cpu, info)) { ret = smp_call_function_single(cpu, func, info, wait); - WARN_ON_ONCE(!ret); + WARN_ON_ONCE(ret); } preempt_enable(); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75b22e2..75875a7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #ifdef CONFIG_NUMA { @@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { @@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd..51b29e9 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event, return; if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + panic("Watchdog detected hard LOCKUP on cpu %d", + this_cpu); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", + this_cpu); __this_cpu_write(hard_watchdog_warn, true); return; @@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } } - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); @@ -484,7 +486,7 @@ static int watchdog_nmi_enable(unsigned int cpu) if (PTR_ERR(event) == -EOPNOTSUPP) pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - pr_warning("disabled (cpu%i): hardware events not enabled\n", + pr_warn("disabled (cpu%i): hardware events not enabled\n", cpu); else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", diff --git a/lib/Kconfig b/lib/Kconfig index a8a775730..df87265 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -396,6 +396,39 @@ config CPU_RMAP config DQL bool +config GLOB + bool +# This actually supports modular compilation, but the module overhead +# is ridiculous for the amount of code involved. Until an out-of-tree +# driver asks for it, we'll just link it directly it into the kernel +# when required. Since we're ignoring out-of-tree users, there's also +# no need bother prompting for a manual decision: +# prompt "glob_match() function" + help + This option provides a glob_match function for performing + simple text pattern matching. It originated in the ATA code + to blacklist particular drive models, but other device drivers + may need similar functionality. + + All drivers in the Linux kernel tree that require this function + should automatically select this option. Say N unless you + are compiling an out-of tree driver which tells you that it + depends on this. + +config GLOB_SELFTEST + bool "glob self-test on init" + default n + depends on GLOB + help + This option enables a simple self-test of the glob_match + function on startup. It is primarily useful for people + working on the code to ensure they haven't introduced any + regressions. + + It only adds a little bit of code and slows kernel boot (or + module load) by a small amount, so you're welcome to play with + it, but you probably don't need it. + # # Netlink attribute parsing support is select'ed if needed # diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cfe7df8..cb45f59 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -15,7 +15,7 @@ config PRINTK_TIME The behavior is also controlled by the kernel command line parameter printk.time=1. See Documentation/kernel-parameters.txt -config DEFAULT_MESSAGE_LOGLEVEL +config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" range 1 7 default "4" diff --git a/lib/Makefile b/lib/Makefile index 8427df9..d6b4bc4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -137,6 +137,8 @@ obj-$(CONFIG_CORDIC) += cordic.o obj-$(CONFIG_DQL) += dynamic_queue_limits.o +obj-$(CONFIG_GLOB) += glob.o + obj-$(CONFIG_MPILIB) += mpi/ obj-$(CONFIG_SIGNATURE) += digsig.o diff --git a/lib/bitmap.c b/lib/bitmap.c index 06f7e4f..1e031f2 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -40,9 +40,9 @@ * for the best explanations of this ordering. */ -int __bitmap_empty(const unsigned long *bitmap, int bits) +int __bitmap_empty(const unsigned long *bitmap, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap[k]) return 0; @@ -55,9 +55,9 @@ int __bitmap_empty(const unsigned long *bitmap, int bits) } EXPORT_SYMBOL(__bitmap_empty); -int __bitmap_full(const unsigned long *bitmap, int bits) +int __bitmap_full(const unsigned long *bitmap, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (~bitmap[k]) return 0; @@ -71,9 +71,9 @@ int __bitmap_full(const unsigned long *bitmap, int bits) EXPORT_SYMBOL(__bitmap_full); int __bitmap_equal(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] != bitmap2[k]) return 0; @@ -86,14 +86,14 @@ int __bitmap_equal(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_equal); -void __bitmap_complement(unsigned long *dst, const unsigned long *src, int bits) +void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) dst[k] = ~src[k]; if (bits % BITS_PER_LONG) - dst[k] = ~src[k] & BITMAP_LAST_WORD_MASK(bits); + dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); @@ -182,23 +182,26 @@ void __bitmap_shift_left(unsigned long *dst, EXPORT_SYMBOL(__bitmap_shift_left); int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; - for (k = 0; k < nr; k++) + for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_and); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] | bitmap2[k]; @@ -206,10 +209,10 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_or); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] ^ bitmap2[k]; @@ -217,22 +220,25 @@ void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_xor); int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; - for (k = 0; k < nr; k++) + for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & ~bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_andnot); int __bitmap_intersects(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & bitmap2[k]) return 1; @@ -245,9 +251,9 @@ int __bitmap_intersects(const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_intersects); int __bitmap_subset(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & ~bitmap2[k]) return 0; @@ -259,9 +265,10 @@ int __bitmap_subset(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_subset); -int __bitmap_weight(const unsigned long *bitmap, int bits) +int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) { - int k, w = 0, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; + int w = 0; for (k = 0; k < lim; k++) w += hweight_long(bitmap[k]); @@ -273,42 +280,42 @@ int __bitmap_weight(const unsigned long *bitmap, int bits) } EXPORT_SYMBOL(__bitmap_weight); -void bitmap_set(unsigned long *map, int start, int nr) +void bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); - const int size = start + nr; + const unsigned int size = start + len; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); - while (nr - bits_to_set >= 0) { + while (len - bits_to_set >= 0) { *p |= mask_to_set; - nr -= bits_to_set; + len -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = ~0UL; p++; } - if (nr) { + if (len) { mask_to_set &= BITMAP_LAST_WORD_MASK(size); *p |= mask_to_set; } } EXPORT_SYMBOL(bitmap_set); -void bitmap_clear(unsigned long *map, int start, int nr) +void bitmap_clear(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); - const int size = start + nr; + const unsigned int size = start + len; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); - while (nr - bits_to_clear >= 0) { + while (len - bits_to_clear >= 0) { *p &= ~mask_to_clear; - nr -= bits_to_clear; + len -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = ~0UL; p++; } - if (nr) { + if (len) { mask_to_clear &= BITMAP_LAST_WORD_MASK(size); *p &= ~mask_to_clear; } @@ -664,13 +671,8 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits) { - char *nl = strchr(bp, '\n'); - int len; - - if (nl) - len = nl - bp; - else - len = strlen(bp); + char *nl = strchrnul(bp, '\n'); + int len = nl - bp; return __bitmap_parselist(bp, len, 0, maskp, nmaskbits); } @@ -716,7 +718,7 @@ EXPORT_SYMBOL(bitmap_parselist_user); * * If for example, just bits 4 through 7 are set in @buf, then @pos * values 4 through 7 will get mapped to 0 through 3, respectively, - * and other @pos values will get mapped to 0. When @pos value 7 + * and other @pos values will get mapped to -1. When @pos value 7 * gets mapped to (returns) @ord value 3 in this example, that means * that bit 7 is the 3rd (starting with 0th) set bit in @buf. * @@ -1046,7 +1048,7 @@ enum { REG_OP_RELEASE, /* clear all bits in region */ }; -static int __reg_op(unsigned long *bitmap, int pos, int order, int reg_op) +static int __reg_op(unsigned long *bitmap, unsigned int pos, int order, int reg_op) { int nbits_reg; /* number of bits in region */ int index; /* index first long of region in bitmap */ @@ -1112,11 +1114,11 @@ done: * Return the bit offset in bitmap of the allocated region, * or -errno on failure. */ -int bitmap_find_free_region(unsigned long *bitmap, int bits, int order) +int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order) { - int pos, end; /* scans bitmap by regions of size order */ + unsigned int pos, end; /* scans bitmap by regions of size order */ - for (pos = 0 ; (end = pos + (1 << order)) <= bits; pos = end) { + for (pos = 0 ; (end = pos + (1U << order)) <= bits; pos = end) { if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) continue; __reg_op(bitmap, pos, order, REG_OP_ALLOC); @@ -1137,7 +1139,7 @@ EXPORT_SYMBOL(bitmap_find_free_region); * * No return value. */ -void bitmap_release_region(unsigned long *bitmap, int pos, int order) +void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order) { __reg_op(bitmap, pos, order, REG_OP_RELEASE); } @@ -1154,12 +1156,11 @@ EXPORT_SYMBOL(bitmap_release_region); * Return 0 on success, or %-EBUSY if specified region wasn't * free (not all bits were zero). */ -int bitmap_allocate_region(unsigned long *bitmap, int pos, int order) +int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) return -EBUSY; - __reg_op(bitmap, pos, order, REG_OP_ALLOC); - return 0; + return __reg_op(bitmap, pos, order, REG_OP_ALLOC); } EXPORT_SYMBOL(bitmap_allocate_region); diff --git a/lib/cmdline.c b/lib/cmdline.c index d4932f7..76a712e 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -121,11 +121,7 @@ EXPORT_SYMBOL(get_options); * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is - * potentially suffixed with %K (for kilobytes, or 1024 bytes), - * %M (for megabytes, or 1048576 bytes), or %G (for gigabytes, or - * 1073741824). If the number is suffixed with K, M, or G, then - * the return value is the number multiplied by one kilobyte, one - * megabyte, or one gigabyte, respectively. + * potentially suffixed with K, M, G, T, P, E. */ unsigned long long memparse(const char *ptr, char **retptr) @@ -135,6 +131,15 @@ unsigned long long memparse(const char *ptr, char **retptr) unsigned long long ret = simple_strtoull(ptr, &endptr, 0); switch (*endptr) { + case 'E': + case 'e': + ret <<= 10; + case 'P': + case 'p': + ret <<= 10; + case 'T': + case 't': + ret <<= 10; case 'G': case 'g': ret <<= 10; diff --git a/lib/glob.c b/lib/glob.c new file mode 100644 index 0000000..500fc80 --- /dev/null +++ b/lib/glob.c @@ -0,0 +1,287 @@ +#include <linux/module.h> +#include <linux/glob.h> + +/* + * The only reason this code can be compiled as a module is because the + * ATA code that depends on it can be as well. In practice, they're + * both usually compiled in and the module overhead goes away. + */ +MODULE_DESCRIPTION("glob(7) matching"); +MODULE_LICENSE("Dual MIT/GPL"); + +/** + * glob_match - Shell-style pattern matching, like !fnmatch(pat, str, 0) + * @pat: Shell-style pattern to match, e.g. "*.[ch]". + * @str: String to match. The pattern must match the entire string. + * + * Perform shell-style glob matching, returning true (1) if the match + * succeeds, or false (0) if it fails. Equivalent to !fnmatch(@pat, @str, 0). + * + * Pattern metacharacters are ?, *, [ and \. + * (And, inside character classes, !, - and ].) + * + * This is small and simple implementation intended for device blacklists + * where a string is matched against a number of patterns. Thus, it + * does not preprocess the patterns. It is non-recursive, and run-time + * is at most quadratic: strlen(@str)*strlen(@pat). + * + * An example of the worst case is glob_match("*aaaaa", "aaaaaaaaaa"); + * it takes 6 passes over the pattern before matching the string. + * + * Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT + * treat / or leading . specially; it isn't actually used for pathnames. + * + * Note that according to glob(7) (and unlike bash), character classes + * are complemented by a leading !; this does not support the regex-style + * [^a-z] syntax. + * + * An opening bracket without a matching close is matched literally. + */ +bool __pure glob_match(char const *pat, char const *str) +{ + /* + * Backtrack to previous * on mismatch and retry starting one + * character later in the string. Because * matches all characters + * (no exception for /), it can be easily proved that there's + * never a need to backtrack multiple levels. + */ + char const *back_pat = NULL, *back_str = back_str; + + /* + * Loop over each token (character or class) in pat, matching + * it against the remaining unmatched tail of str. Return false + * on mismatch, or true after matching the trailing nul bytes. + */ + for (;;) { + unsigned char c = *str++; + unsigned char d = *pat++; + + switch (d) { + case '?': /* Wildcard: anything but nul */ + if (c == '\0') + return false; + break; + case '*': /* Any-length wildcard */ + if (*pat == '\0') /* Optimize trailing * case */ + return true; + back_pat = pat; + back_str = --str; /* Allow zero-length match */ + break; + case '[': { /* Character class */ + bool match = false, inverted = (*pat == '!'); + char const *class = pat + inverted; + unsigned char a = *class++; + + /* + * Iterate over each span in the character class. + * A span is either a single character a, or a + * range a-b. The first span may begin with ']'. + */ + do { + unsigned char b = a; + + if (a == '\0') /* Malformed */ + goto literal; + + if (class[0] == '-' && class[1] != ']') { + b = class[1]; + + if (b == '\0') + goto literal; + + class += 2; + /* Any special action if a > b? */ + } + match |= (a <= c && c <= b); + } while ((a = *class++) != ']'); + + if (match == inverted) + goto backtrack; + pat = class; + } + break; + case '\\': + d = *pat++; + /*FALLTHROUGH*/ + default: /* Literal character */ +literal: + if (c == d) { + if (d == '\0') + return true; + break; + } +backtrack: + if (c == '\0' || !back_pat) + return false; /* No point continuing */ + /* Try again from last *, one character later in str. */ + pat = back_pat; + str = ++back_str; + break; + } + } +} +EXPORT_SYMBOL(glob_match); + + +#ifdef CONFIG_GLOB_SELFTEST + +#include <linux/printk.h> +#include <linux/moduleparam.h> + +/* Boot with "glob.verbose=1" to show successful tests, too */ +static bool verbose = false; +module_param(verbose, bool, 0); + +struct glob_test { + char const *pat, *str; + bool expected; +}; + +static bool __pure __init test(char const *pat, char const *str, bool expected) +{ + bool match = glob_match(pat, str); + bool success = match == expected; + + /* Can't get string literals into a particular section, so... */ + static char const msg_error[] __initconst = + KERN_ERR "glob: \"%s\" vs. \"%s\": %s *** ERROR ***\n"; + static char const msg_ok[] __initconst = + KERN_DEBUG "glob: \"%s\" vs. \"%s\": %s OK\n"; + static char const mismatch[] __initconst = "mismatch"; + char const *message; + + if (!success) + message = msg_error; + else if (verbose) + message = msg_ok; + else + return success; + + printk(message, pat, str, mismatch + 3*match); + return success; +} + +/* + * The tests are all jammed together in one array to make it simpler + * to place that array in the .init.rodata section. The obvious + * "array of structures containing char *" has no way to force the + * pointed-to strings to be in a particular section. + * + * Anyway, a test consists of: + * 1. Expected glob_match result: '1' or '0'. + * 2. Pattern to match: null-terminated string + * 3. String to match against: null-terminated string + * + * The list of tests is terminated with a final '\0' instead of + * a glob_match result character. + */ +static char const glob_tests[] __initconst = + /* Some basic tests */ + "1" "a\0" "a\0" + "0" "a\0" "b\0" + "0" "a\0" "aa\0" + "0" "a\0" "\0" + "1" "\0" "\0" + "0" "\0" "a\0" + /* Simple character class tests */ + "1" "[a]\0" "a\0" + "0" "[a]\0" "b\0" + "0" "[!a]\0" "a\0" + "1" "[!a]\0" "b\0" + "1" "[ab]\0" "a\0" + "1" "[ab]\0" "b\0" + "0" "[ab]\0" "c\0" + "1" "[!ab]\0" "c\0" + "1" "[a-c]\0" "b\0" + "0" "[a-c]\0" "d\0" + /* Corner cases in character class parsing */ + "1" "[a-c-e-g]\0" "-\0" + "0" "[a-c-e-g]\0" "d\0" + "1" "[a-c-e-g]\0" "f\0" + "1" "[]a-ceg-ik[]\0" "a\0" + "1" "[]a-ceg-ik[]\0" "]\0" + "1" "[]a-ceg-ik[]\0" "[\0" + "1" "[]a-ceg-ik[]\0" "h\0" + "0" "[]a-ceg-ik[]\0" "f\0" + "0" "[!]a-ceg-ik[]\0" "h\0" + "0" "[!]a-ceg-ik[]\0" "]\0" + "1" "[!]a-ceg-ik[]\0" "f\0" + /* Simple wild cards */ + "1" "?\0" "a\0" + "0" "?\0" "aa\0" + "0" "??\0" "a\0" + "1" "?x?\0" "axb\0" + "0" "?x?\0" "abx\0" + "0" "?x?\0" "xab\0" + /* Asterisk wild cards (backtracking) */ + "0" "*??\0" "a\0" + "1" "*??\0" "ab\0" + "1" "*??\0" "abc\0" + "1" "*??\0" "abcd\0" + "0" "??*\0" "a\0" + "1" "??*\0" "ab\0" + "1" "??*\0" "abc\0" + "1" "??*\0" "abcd\0" + "0" "?*?\0" "a\0" + "1" "?*?\0" "ab\0" + "1" "?*?\0" "abc\0" + "1" "?*?\0" "abcd\0" + "1" "*b\0" "b\0" + "1" "*b\0" "ab\0" + "0" "*b\0" "ba\0" + "1" "*b\0" "bb\0" + "1" "*b\0" "abb\0" + "1" "*b\0" "bab\0" + "1" "*bc\0" "abbc\0" + "1" "*bc\0" "bc\0" + "1" "*bc\0" "bbc\0" + "1" "*bc\0" "bcbc\0" + /* Multiple asterisks (complex backtracking) */ + "1" "*ac*\0" "abacadaeafag\0" + "1" "*ac*ae*ag*\0" "abacadaeafag\0" + "1" "*a*b*[bc]*[ef]*g*\0" "abacadaeafag\0" + "0" "*a*b*[ef]*[cd]*g*\0" "abacadaeafag\0" + "1" "*abcd*\0" "abcabcabcabcdefg\0" + "1" "*ab*cd*\0" "abcabcabcabcdefg\0" + "1" "*abcd*abcdef*\0" "abcabcdabcdeabcdefg\0" + "0" "*abcd*\0" "abcabcabcabcefg\0" + "0" "*ab*cd*\0" "abcabcabcabcefg\0"; + +static int __init glob_init(void) +{ + unsigned successes = 0; + unsigned n = 0; + char const *p = glob_tests; + static char const message[] __initconst = + KERN_INFO "glob: %u self-tests passed, %u failed\n"; + + /* + * Tests are jammed together in a string. The first byte is '1' + * or '0' to indicate the expected outcome, or '\0' to indicate the + * end of the tests. Then come two null-terminated strings: the + * pattern and the string to match it against. + */ + while (*p) { + bool expected = *p++ & 1; + char const *pat = p; + + p += strlen(p) + 1; + successes += test(pat, p, expected); + p += strlen(p) + 1; + n++; + } + + n -= successes; + printk(message, successes, n); + + /* What's the errno for "kernel bug detected"? Guess... */ + return n ? -ECANCELED : 0; +} + +/* We need a dummy exit function to allow unload */ +static void __exit glob_fini(void) { } + +module_init(glob_init); +module_exit(glob_fini); + +#endif /* CONFIG_GLOB_SELFTEST */ diff --git a/lib/klist.c b/lib/klist.c index 358a368a..89b485a 100644 --- a/lib/klist.c +++ b/lib/klist.c @@ -140,11 +140,11 @@ void klist_add_tail(struct klist_node *n, struct klist *k) EXPORT_SYMBOL_GPL(klist_add_tail); /** - * klist_add_after - Init a klist_node and add it after an existing node + * klist_add_behind - Init a klist_node and add it after an existing node * @n: node we're adding. * @pos: node to put @n after */ -void klist_add_after(struct klist_node *n, struct klist_node *pos) +void klist_add_behind(struct klist_node *n, struct klist_node *pos) { struct klist *k = knode_klist(pos); @@ -153,7 +153,7 @@ void klist_add_after(struct klist_node *n, struct klist_node *pos) list_add(&n->n_node, &pos->n_node); spin_unlock(&k->k_lock); } -EXPORT_SYMBOL_GPL(klist_add_after); +EXPORT_SYMBOL_GPL(klist_add_behind); /** * klist_add_before - Init a klist_node and add it before an existing node diff --git a/lib/list_sort.c b/lib/list_sort.c index 1183fa7..12bcba1 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -1,3 +1,6 @@ + +#define pr_fmt(fmt) "list_sort_test: " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/list_sort.h> @@ -47,6 +50,7 @@ static void merge_and_restore_back_links(void *priv, struct list_head *a, struct list_head *b) { struct list_head *tail = head; + u8 count = 0; while (a && b) { /* if equal, take 'a' -- important for sort stability */ @@ -70,7 +74,8 @@ static void merge_and_restore_back_links(void *priv, * element comparison is needed, so the client's cmp() * routine can invoke cond_resched() periodically. */ - (*cmp)(priv, tail->next, tail->next); + if (unlikely(!(++count))) + (*cmp)(priv, tail->next, tail->next); tail->next->prev = tail; tail = tail->next; @@ -123,9 +128,7 @@ void list_sort(void *priv, struct list_head *head, } if (lev > max_lev) { if (unlikely(lev >= ARRAY_SIZE(part)-1)) { - printk_once(KERN_DEBUG "list passed to" - " list_sort() too long for" - " efficiency\n"); + printk_once(KERN_DEBUG "list too long for efficiency\n"); lev--; } max_lev = lev; @@ -168,27 +171,25 @@ static struct debug_el **elts __initdata; static int __init check(struct debug_el *ela, struct debug_el *elb) { if (ela->serial >= TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", - ela->serial); + pr_err("error: incorrect serial %d\n", ela->serial); return -EINVAL; } if (elb->serial >= TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", - elb->serial); + pr_err("error: incorrect serial %d\n", elb->serial); return -EINVAL; } if (elts[ela->serial] != ela || elts[elb->serial] != elb) { - printk(KERN_ERR "list_sort_test: error: phantom element\n"); + pr_err("error: phantom element\n"); return -EINVAL; } if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { - printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", - ela->poison1, ela->poison2); + pr_err("error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); return -EINVAL; } if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { - printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", - elb->poison1, elb->poison2); + pr_err("error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); return -EINVAL; } return 0; @@ -207,25 +208,23 @@ static int __init cmp(void *priv, struct list_head *a, struct list_head *b) static int __init list_sort_test(void) { - int i, count = 1, err = -EINVAL; + int i, count = 1, err = -ENOMEM; struct debug_el *el; - struct list_head *cur, *tmp; + struct list_head *cur; LIST_HEAD(head); - printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); + pr_debug("start testing list_sort()\n"); - elts = kmalloc(sizeof(void *) * TEST_LIST_LEN, GFP_KERNEL); + elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); if (!elts) { - printk(KERN_ERR "list_sort_test: error: cannot allocate " - "memory\n"); - goto exit; + pr_err("error: cannot allocate memory\n"); + return err; } for (i = 0; i < TEST_LIST_LEN; i++) { el = kmalloc(sizeof(*el), GFP_KERNEL); if (!el) { - printk(KERN_ERR "list_sort_test: error: cannot " - "allocate memory\n"); + pr_err("error: cannot allocate memory\n"); goto exit; } /* force some equivalencies */ @@ -239,52 +238,52 @@ static int __init list_sort_test(void) list_sort(NULL, &head, cmp); + err = -EINVAL; for (cur = head.next; cur->next != &head; cur = cur->next) { struct debug_el *el1; int cmp_result; if (cur->next->prev != cur) { - printk(KERN_ERR "list_sort_test: error: list is " - "corrupted\n"); + pr_err("error: list is corrupted\n"); goto exit; } cmp_result = cmp(NULL, cur, cur->next); if (cmp_result > 0) { - printk(KERN_ERR "list_sort_test: error: list is not " - "sorted\n"); + pr_err("error: list is not sorted\n"); goto exit; } el = container_of(cur, struct debug_el, list); el1 = container_of(cur->next, struct debug_el, list); if (cmp_result == 0 && el->serial >= el1->serial) { - printk(KERN_ERR "list_sort_test: error: order of " - "equivalent elements not preserved\n"); + pr_err("error: order of equivalent elements not " + "preserved\n"); goto exit; } if (check(el, el1)) { - printk(KERN_ERR "list_sort_test: error: element check " - "failed\n"); + pr_err("error: element check failed\n"); goto exit; } count++; } + if (head.prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + if (count != TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: bad list length %d", - count); + pr_err("error: bad list length %d", count); goto exit; } err = 0; exit: + for (i = 0; i < TEST_LIST_LEN; i++) + kfree(elts[i]); kfree(elts); - list_for_each_safe(cur, tmp, &head) { - list_del(cur); - kfree(container_of(cur, struct debug_el, list)); - } return err; } module_init(list_sort_test); diff --git a/lib/string_helpers.c b/lib/string_helpers.c index ed5c145..29033f3 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -25,12 +25,15 @@ int string_get_size(u64 size, const enum string_size_units units, char *buf, int len) { - static const char *units_10[] = { "B", "kB", "MB", "GB", "TB", "PB", - "EB", "ZB", "YB", NULL}; - static const char *units_2[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", - "EiB", "ZiB", "YiB", NULL }; - static const char **units_str[] = { - [STRING_UNITS_10] = units_10, + static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", NULL + }; + static const char *const units_2[] = { + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB", + NULL + }; + static const char *const *const units_str[] = { + [STRING_UNITS_10] = units_10, [STRING_UNITS_2] = units_2, }; static const unsigned int divisor[] = { diff --git a/lib/test-kstrtox.c b/lib/test-kstrtox.c index bea3f3f..4137bca 100644 --- a/lib/test-kstrtox.c +++ b/lib/test-kstrtox.c @@ -3,7 +3,7 @@ #include <linux/module.h> #define for_each_test(i, test) \ - for (i = 0; i < sizeof(test) / sizeof(test[0]); i++) + for (i = 0; i < ARRAY_SIZE(test); i++) struct test_fail { const char *str; diff --git a/lib/zlib_deflate/deflate.c b/lib/zlib_deflate/deflate.c index d63381e..d20ef45 100644 --- a/lib/zlib_deflate/deflate.c +++ b/lib/zlib_deflate/deflate.c @@ -250,52 +250,6 @@ int zlib_deflateInit2( } /* ========================================================================= */ -#if 0 -int zlib_deflateSetDictionary( - z_streamp strm, - const Byte *dictionary, - uInt dictLength -) -{ - deflate_state *s; - uInt length = dictLength; - uInt n; - IPos hash_head = 0; - - if (strm == NULL || strm->state == NULL || dictionary == NULL) - return Z_STREAM_ERROR; - - s = (deflate_state *) strm->state; - if (s->status != INIT_STATE) return Z_STREAM_ERROR; - - strm->adler = zlib_adler32(strm->adler, dictionary, dictLength); - - if (length < MIN_MATCH) return Z_OK; - if (length > MAX_DIST(s)) { - length = MAX_DIST(s); -#ifndef USE_DICT_HEAD - dictionary += dictLength - length; /* use the tail of the dictionary */ -#endif - } - memcpy((char *)s->window, dictionary, length); - s->strstart = length; - s->block_start = (long)length; - - /* Insert all strings in the hash table (except for the last two bytes). - * s->lookahead stays null, so s->ins_h will be recomputed at the next - * call of fill_window. - */ - s->ins_h = s->window[0]; - UPDATE_HASH(s, s->ins_h, s->window[1]); - for (n = 0; n <= length - MIN_MATCH; n++) { - INSERT_STRING(s, n, hash_head); - } - if (hash_head) hash_head = 0; /* to make compiler happy */ - return Z_OK; -} -#endif /* 0 */ - -/* ========================================================================= */ int zlib_deflateReset( z_streamp strm ) @@ -326,45 +280,6 @@ int zlib_deflateReset( return Z_OK; } -/* ========================================================================= */ -#if 0 -int zlib_deflateParams( - z_streamp strm, - int level, - int strategy -) -{ - deflate_state *s; - compress_func func; - int err = Z_OK; - - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - s = (deflate_state *) strm->state; - - if (level == Z_DEFAULT_COMPRESSION) { - level = 6; - } - if (level < 0 || level > 9 || strategy < 0 || strategy > Z_HUFFMAN_ONLY) { - return Z_STREAM_ERROR; - } - func = configuration_table[s->level].func; - - if (func != configuration_table[level].func && strm->total_in != 0) { - /* Flush the last buffer: */ - err = zlib_deflate(strm, Z_PARTIAL_FLUSH); - } - if (s->level != level) { - s->level = level; - s->max_lazy_match = configuration_table[level].max_lazy; - s->good_match = configuration_table[level].good_length; - s->nice_match = configuration_table[level].nice_length; - s->max_chain_length = configuration_table[level].max_chain; - } - s->strategy = strategy; - return err; -} -#endif /* 0 */ - /* ========================================================================= * Put a short in the pending buffer. The 16-bit value is put in MSB order. * IN assertion: the stream state is correct and there is enough room in @@ -568,64 +483,6 @@ int zlib_deflateEnd( return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; } -/* ========================================================================= - * Copy the source state to the destination state. - */ -#if 0 -int zlib_deflateCopy ( - z_streamp dest, - z_streamp source -) -{ -#ifdef MAXSEG_64K - return Z_STREAM_ERROR; -#else - deflate_state *ds; - deflate_state *ss; - ush *overlay; - deflate_workspace *mem; - - - if (source == NULL || dest == NULL || source->state == NULL) { - return Z_STREAM_ERROR; - } - - ss = (deflate_state *) source->state; - - *dest = *source; - - mem = (deflate_workspace *) dest->workspace; - - ds = &(mem->deflate_memory); - - dest->state = (struct internal_state *) ds; - *ds = *ss; - ds->strm = dest; - - ds->window = (Byte *) mem->window_memory; - ds->prev = (Pos *) mem->prev_memory; - ds->head = (Pos *) mem->head_memory; - overlay = (ush *) mem->overlay_memory; - ds->pending_buf = (uch *) overlay; - - memcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte)); - memcpy(ds->prev, ss->prev, ds->w_size * sizeof(Pos)); - memcpy(ds->head, ss->head, ds->hash_size * sizeof(Pos)); - memcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size); - - ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf); - ds->d_buf = overlay + ds->lit_bufsize/sizeof(ush); - ds->l_buf = ds->pending_buf + (1+sizeof(ush))*ds->lit_bufsize; - - ds->l_desc.dyn_tree = ds->dyn_ltree; - ds->d_desc.dyn_tree = ds->dyn_dtree; - ds->bl_desc.dyn_tree = ds->bl_tree; - - return Z_OK; -#endif -} -#endif /* 0 */ - /* =========================================================================== * Read a new buffer from the current input stream, update the adler32 * and total number of bytes read. All deflate() input goes through diff --git a/lib/zlib_inflate/inflate.c b/lib/zlib_inflate/inflate.c index f5ce87b..58a733b 100644 --- a/lib/zlib_inflate/inflate.c +++ b/lib/zlib_inflate/inflate.c @@ -45,21 +45,6 @@ int zlib_inflateReset(z_streamp strm) return Z_OK; } -#if 0 -int zlib_inflatePrime(z_streamp strm, int bits, int value) -{ - struct inflate_state *state; - - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR; - value &= (1L << bits) - 1; - state->hold += value << state->bits; - state->bits += bits; - return Z_OK; -} -#endif - int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; @@ -761,123 +746,6 @@ int zlib_inflateEnd(z_streamp strm) return Z_OK; } -#if 0 -int zlib_inflateSetDictionary(z_streamp strm, const Byte *dictionary, - uInt dictLength) -{ - struct inflate_state *state; - unsigned long id; - - /* check state */ - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (state->wrap != 0 && state->mode != DICT) - return Z_STREAM_ERROR; - - /* check for correct dictionary id */ - if (state->mode == DICT) { - id = zlib_adler32(0L, NULL, 0); - id = zlib_adler32(id, dictionary, dictLength); - if (id != state->check) - return Z_DATA_ERROR; - } - - /* copy dictionary to window */ - zlib_updatewindow(strm, strm->avail_out); - - if (dictLength > state->wsize) { - memcpy(state->window, dictionary + dictLength - state->wsize, - state->wsize); - state->whave = state->wsize; - } - else { - memcpy(state->window + state->wsize - dictLength, dictionary, - dictLength); - state->whave = dictLength; - } - state->havedict = 1; - return Z_OK; -} -#endif - -#if 0 -/* - Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff. Return when found - or when out of input. When called, *have is the number of pattern bytes - found in order so far, in 0..3. On return *have is updated to the new - state. If on return *have equals four, then the pattern was found and the - return value is how many bytes were read including the last byte of the - pattern. If *have is less than four, then the pattern has not been found - yet and the return value is len. In the latter case, zlib_syncsearch() can be - called again with more data and the *have state. *have is initialized to - zero for the first call. - */ -static unsigned zlib_syncsearch(unsigned *have, unsigned char *buf, - unsigned len) -{ - unsigned got; - unsigned next; - - got = *have; - next = 0; - while (next < len && got < 4) { - if ((int)(buf[next]) == (got < 2 ? 0 : 0xff)) - got++; - else if (buf[next]) - got = 0; - else - got = 4 - got; - next++; - } - *have = got; - return next; -} -#endif - -#if 0 -int zlib_inflateSync(z_streamp strm) -{ - unsigned len; /* number of bytes to look at or looked at */ - unsigned long in, out; /* temporary to save total_in and total_out */ - unsigned char buf[4]; /* to restore bit buffer to byte string */ - struct inflate_state *state; - - /* check parameters */ - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR; - - /* if first time, start search in bit buffer */ - if (state->mode != SYNC) { - state->mode = SYNC; - state->hold <<= state->bits & 7; - state->bits -= state->bits & 7; - len = 0; - while (state->bits >= 8) { - buf[len++] = (unsigned char)(state->hold); - state->hold >>= 8; - state->bits -= 8; - } - state->have = 0; - zlib_syncsearch(&(state->have), buf, len); - } - - /* search available input */ - len = zlib_syncsearch(&(state->have), strm->next_in, strm->avail_in); - strm->avail_in -= len; - strm->next_in += len; - strm->total_in += len; - - /* return no joy or set up to restart inflate() on a new block */ - if (state->have != 4) return Z_DATA_ERROR; - in = strm->total_in; out = strm->total_out; - zlib_inflateReset(strm); - strm->total_in = in; strm->total_out = out; - state->mode = TYPE; - return Z_OK; -} -#endif - /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; @@ -508,21 +508,34 @@ config CMA_DEBUG processing calls such as dma_alloc_from_contiguous(). This option does not affect warning and error messages. -config ZBUD - tristate - default n +config CMA_AREAS + int "Maximum count of the CMA areas" + depends on CMA + default 7 help - A special purpose allocator for storing compressed pages. - It is designed to store up to two compressed pages per physical - page. While this design limits storage density, it has simple and - deterministic reclaim properties that make it preferable to a higher - density approach when reclaim will be used. + CMA allows to create CMA areas for particular purpose, mainly, + used as device private area. This parameter sets the maximum + number of CMA area in the system. + + If unsure, leave the default value "7". + +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" depends on FRONTSWAP && CRYPTO=y select CRYPTO_LZO - select ZBUD + select ZPOOL default n help A lightweight compressed cache for swap pages. It takes @@ -538,17 +551,22 @@ config ZSWAP they have not be fully explored on the large set of potential configurations and workloads that exist. -config MEM_SOFT_DIRTY - bool "Track memory changes" - depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS - select PROC_PAGE_MONITOR +config ZPOOL + tristate "Common API for compressed memory storage" + default n help - This option enables memory changes tracking by introducing a - soft-dirty bit on pte-s. This bit it set when someone writes - into a page just as regular dirty bit, but unlike the latter - it can be cleared by hands. + Compressed memory storage API. This allows using either zbud or + zsmalloc. - See Documentation/vm/soft-dirty.txt for more details. +config ZBUD + tristate "Low density storage for compressed pages" + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. config ZSMALLOC tristate "Memory allocator for compressed pages" diff --git a/mm/Makefile b/mm/Makefile index 4064f3e..632ae77 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -59,6 +59,8 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o +obj-$(CONFIG_CMA) += cma.o diff --git a/mm/cma.c b/mm/cma.c new file mode 100644 index 0000000..c17751c --- /dev/null +++ b/mm/cma.c @@ -0,0 +1,335 @@ +/* + * Contiguous Memory Allocator + * + * Copyright (c) 2010-2011 by Samsung Electronics. + * Copyright IBM Corporation, 2013 + * Copyright LG Electronics Inc., 2014 + * Written by: + * Marek Szyprowski <m.szyprowski@samsung.com> + * Michal Nazarewicz <mina86@mina86.com> + * Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * Joonsoo Kim <iamjoonsoo.kim@lge.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + */ + +#define pr_fmt(fmt) "cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif + +#include <linux/memblock.h> +#include <linux/err.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/log2.h> +#include <linux/cma.h> + +struct cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; + unsigned int order_per_bit; /* Order of pages represented by one bit */ + struct mutex lock; +}; + +static struct cma cma_areas[MAX_CMA_AREAS]; +static unsigned cma_area_count; +static DEFINE_MUTEX(cma_mutex); + +phys_addr_t cma_get_base(struct cma *cma) +{ + return PFN_PHYS(cma->base_pfn); +} + +unsigned long cma_get_size(struct cma *cma) +{ + return cma->count << PAGE_SHIFT; +} + +static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) +{ + return (1UL << (align_order >> cma->order_per_bit)) - 1; +} + +static unsigned long cma_bitmap_maxno(struct cma *cma) +{ + return cma->count >> cma->order_per_bit; +} + +static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, + unsigned long pages) +{ + return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; +} + +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) +{ + unsigned long bitmap_no, bitmap_count; + + bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + mutex_lock(&cma->lock); + bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); + mutex_unlock(&cma->lock); +} + +static int __init cma_activate_area(struct cma *cma) +{ + int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long); + unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; + unsigned i = cma->count >> pageblock_order; + struct zone *zone; + + cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + + if (!cma->bitmap) + return -ENOMEM; + + WARN_ON_ONCE(!pfn_valid(pfn)); + zone = page_zone(pfn_to_page(pfn)); + + do { + unsigned j; + + base_pfn = pfn; + for (j = pageblock_nr_pages; j; --j, pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + /* + * alloc_contig_range requires the pfn range + * specified to be in the same zone. Make this + * simple by forcing the entire CMA resv range + * to be in the same zone. + */ + if (page_zone(pfn_to_page(pfn)) != zone) + goto err; + } + init_cma_reserved_pageblock(pfn_to_page(base_pfn)); + } while (--i); + + mutex_init(&cma->lock); + return 0; + +err: + kfree(cma->bitmap); + return -EINVAL; +} + +static int __init cma_init_reserved_areas(void) +{ + int i; + + for (i = 0; i < cma_area_count; i++) { + int ret = cma_activate_area(&cma_areas[i]); + + if (ret) + return ret; + } + + return 0; +} +core_initcall(cma_init_reserved_areas); + +/** + * cma_declare_contiguous() - reserve custom contiguous area + * @base: Base address of the reserved area optional, use 0 for any + * @size: Size of the reserved area (in bytes), + * @limit: End address of the reserved memory (optional, 0 for any). + * @alignment: Alignment for the CMA area, should be power of 2 or zero + * @order_per_bit: Order of pages represented by one bit on bitmap. + * @fixed: hint about where to place the reserved area + * @res_cma: Pointer to store the created cma region. + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. This function allows to create custom reserved areas. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. + */ +int __init cma_declare_contiguous(phys_addr_t base, + phys_addr_t size, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + bool fixed, struct cma **res_cma) +{ + struct cma *cma; + int ret = 0; + + pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", + __func__, (unsigned long)size, (unsigned long)base, + (unsigned long)limit, (unsigned long)alignment); + + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size) + return -EINVAL; + + if (alignment && !is_power_of_2(alignment)) + return -EINVAL; + + /* + * Sanitise input arguments. + * Pages both ends in CMA area could be merged into adjacent unmovable + * migratetype page by page allocator's buddy algorithm. In the case, + * you couldn't get a contiguous memory, which is not what we want. + */ + alignment = max(alignment, + (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); + base = ALIGN(base, alignment); + size = ALIGN(size, alignment); + limit &= ~(alignment - 1); + + /* size should be aligned with order_per_bit */ + if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + /* Reserve memory */ + if (base && fixed) { + if (memblock_is_region_reserved(base, size) || + memblock_reserve(base, size) < 0) { + ret = -EBUSY; + goto err; + } + } else { + phys_addr_t addr = memblock_alloc_range(size, alignment, base, + limit); + if (!addr) { + ret = -ENOMEM; + goto err; + } else { + base = addr; + } + } + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. + */ + cma = &cma_areas[cma_area_count]; + cma->base_pfn = PFN_DOWN(base); + cma->count = size >> PAGE_SHIFT; + cma->order_per_bit = order_per_bit; + *res_cma = cma; + cma_area_count++; + + pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, + (unsigned long)base); + return 0; + +err: + pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + return ret; +} + +/** + * cma_alloc() - allocate pages from contiguous area + * @cma: Contiguous memory region for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * + * This function allocates part of contiguous memory on specific + * contiguous memory area. + */ +struct page *cma_alloc(struct cma *cma, int count, unsigned int align) +{ + unsigned long mask, pfn, start = 0; + unsigned long bitmap_maxno, bitmap_no, bitmap_count; + struct page *page = NULL; + int ret; + + if (!cma || !cma->count) + return NULL; + + pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma, + count, align); + + if (!count) + return NULL; + + mask = cma_bitmap_aligned_mask(cma, align); + bitmap_maxno = cma_bitmap_maxno(cma); + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + for (;;) { + mutex_lock(&cma->lock); + bitmap_no = bitmap_find_next_zero_area(cma->bitmap, + bitmap_maxno, start, bitmap_count, mask); + if (bitmap_no >= bitmap_maxno) { + mutex_unlock(&cma->lock); + break; + } + bitmap_set(cma->bitmap, bitmap_no, bitmap_count); + /* + * It's safe to drop the lock here. We've marked this region for + * our exclusive use. If the migration fails we will take the + * lock again and unmark it. + */ + mutex_unlock(&cma->lock); + + pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); + mutex_lock(&cma_mutex); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); + mutex_unlock(&cma_mutex); + if (ret == 0) { + page = pfn_to_page(pfn); + break; + } + + cma_clear_bitmap(cma, pfn, count); + if (ret != -EBUSY) + break; + + pr_debug("%s(): memory range at %p is busy, retrying\n", + __func__, pfn_to_page(pfn)); + /* try again with a bit different memory target */ + start = bitmap_no + mask + 1; + } + + pr_debug("%s(): returned %p\n", __func__, page); + return page; +} + +/** + * cma_release() - release allocated pages + * @cma: Contiguous memory region for which the allocation is performed. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by alloc_cma(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. + */ +bool cma_release(struct cma *cma, struct page *pages, int count) +{ + unsigned long pfn; + + if (!cma || !pages) + return false; + + pr_debug("%s(page %p)\n", __func__, (void *)pages); + + pfn = page_to_pfn(pages); + + if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) + return false; + + VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); + + free_contig_range(pfn, count); + cma_clear_bitmap(cma, pfn, count); + + return true; +} diff --git a/mm/filemap.c b/mm/filemap.c index 65d44fd..af19a6b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -808,6 +808,17 @@ int __lock_page_killable(struct page *page) } EXPORT_SYMBOL_GPL(__lock_page_killable); +/* + * Return values: + * 1 - page is locked; mmap_sem is still held. + * 0 - page is not locked. + * mmap_sem has been released (up_read()), unless flags had both + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in + * which case mmap_sem is still held. + * + * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 + * with the page locked and the mmap_sem unperturbed. + */ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { @@ -1091,9 +1102,9 @@ no_page: if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) fgp_flags |= FGP_LOCK; - /* Init accessed so avoit atomic mark_page_accessed later */ + /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) - init_page_accessed(page); + __SetPageReferenced(page); err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { @@ -1827,6 +1838,18 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. + * + * vma->vm_mm->mmap_sem must be held on entry. + * + * If our return value has VM_FAULT_RETRY set, it's because + * lock_page_or_retry() returned 0. + * The mmap_sem has usually been released in this case. + * See __lock_page_or_retry() for the exception. + * + * If our return value does not have VM_FAULT_RETRY set, the mmap_sem + * has not been released. + * + * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -258,6 +258,11 @@ unmap: return ret; } +/* + * mmap_sem must be held on entry. If @nonblocking != NULL and + * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. + * If it is, *@nonblocking will be set to 0 and -EBUSY returned. + */ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *nonblocking) { @@ -373,7 +378,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * with a put_page() call when it is finished with. vmas will only * remain valid while mmap_sem is held. * - * Must be called with mmap_sem held for read or write. + * Must be called with mmap_sem held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given @@ -396,7 +401,14 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * * If @nonblocking != NULL, __get_user_pages will not wait for disk IO * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. + * *@nonblocking will be set to 0. Further, if @gup_flags does not + * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in + * this case. + * + * A caller using such a combination of @nonblocking and @gup_flags + * must therefore hold the mmap_sem for reading only, and recognize + * when it's been released. Otherwise, it must be held for either + * reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if @@ -528,7 +540,7 @@ EXPORT_SYMBOL(__get_user_pages); * such architectures, gup() will not be enough to make a subsequent access * succeed. * - * This should be called with the mm_sem held for read. + * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags) diff --git a/mm/highmem.c b/mm/highmem.c index b32b70c..123bcd3 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -44,6 +44,66 @@ DEFINE_PER_CPU(int, __kmap_atomic_idx); */ #ifdef CONFIG_HIGHMEM +/* + * Architecture with aliasing data cache may define the following family of + * helper functions in its asm/highmem.h to control cache color of virtual + * addresses where physical memory pages are mapped by kmap. + */ +#ifndef get_pkmap_color + +/* + * Determine color of virtual address where the page should be mapped. + */ +static inline unsigned int get_pkmap_color(struct page *page) +{ + return 0; +} +#define get_pkmap_color get_pkmap_color + +/* + * Get next index for mapping inside PKMAP region for page with given color. + */ +static inline unsigned int get_next_pkmap_nr(unsigned int color) +{ + static unsigned int last_pkmap_nr; + + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; + return last_pkmap_nr; +} + +/* + * Determine if page index inside PKMAP region (pkmap_nr) of given color + * has wrapped around PKMAP region end. When this happens an attempt to + * flush all unused PKMAP slots is made. + */ +static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color) +{ + return pkmap_nr == 0; +} + +/* + * Get the number of PKMAP entries of the given color. If no free slot is + * found after checking that many entries, kmap will sleep waiting for + * someone to call kunmap and free PKMAP slot. + */ +static inline int get_pkmap_entries_count(unsigned int color) +{ + return LAST_PKMAP; +} + +/* + * Get head of a wait queue for PKMAP entries of the given color. + * Wait queues for different mapping colors should be independent to avoid + * unnecessary wakeups caused by freeing of slots of other colors. + */ +static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) +{ + static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + + return &pkmap_map_wait; +} +#endif + unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); @@ -68,13 +128,10 @@ unsigned int nr_free_highpages (void) } static int pkmap_count[LAST_PKMAP]; -static unsigned int last_pkmap_nr; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); pte_t * pkmap_page_table; -static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); - /* * Most architectures have no use for kmap_high_get(), so let's abstract * the disabling of IRQ out of the locking in that case to save on a @@ -161,15 +218,17 @@ static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; int count; + unsigned int last_pkmap_nr; + unsigned int color = get_pkmap_color(page); start: - count = LAST_PKMAP; + count = get_pkmap_entries_count(color); /* Find an empty entry */ for (;;) { - last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; - if (!last_pkmap_nr) { + last_pkmap_nr = get_next_pkmap_nr(color); + if (no_more_pkmaps(last_pkmap_nr, color)) { flush_all_zero_pkmaps(); - count = LAST_PKMAP; + count = get_pkmap_entries_count(color); } if (!pkmap_count[last_pkmap_nr]) break; /* Found a usable entry */ @@ -181,12 +240,14 @@ start: */ { DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *pkmap_map_wait = + get_pkmap_wait_queue_head(color); __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&pkmap_map_wait, &wait); + add_wait_queue(pkmap_map_wait, &wait); unlock_kmap(); schedule(); - remove_wait_queue(&pkmap_map_wait, &wait); + remove_wait_queue(pkmap_map_wait, &wait); lock_kmap(); /* Somebody else might have mapped it while we slept */ @@ -274,6 +335,8 @@ void kunmap_high(struct page *page) unsigned long nr; unsigned long flags; int need_wakeup; + unsigned int color = get_pkmap_color(page); + wait_queue_head_t *pkmap_map_wait; lock_kmap_any(flags); vaddr = (unsigned long)page_address(page); @@ -299,13 +362,14 @@ void kunmap_high(struct page *page) * no need for the wait-queue-head's lock. Simply * test if the queue is empty. */ - need_wakeup = waitqueue_active(&pkmap_map_wait); + pkmap_map_wait = get_pkmap_wait_queue_head(color); + need_wakeup = waitqueue_active(pkmap_map_wait); } unlock_kmap_any(flags); /* do wake-up, if needed, race-free outside of the spin lock */ if (need_wakeup) - wake_up(&pkmap_map_wait); + wake_up(pkmap_map_wait); } EXPORT_SYMBOL(kunmap_high); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 33514d8..3630d57 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_TRANSHUGE))) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1132,7 +1132,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) { put_page(new_page); if (page) { split_huge_page(page); @@ -1681,7 +1681,7 @@ static void __split_huge_page_refcount(struct page *page, &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ - smp_mb(); + smp_mb__after_atomic(); /* * retain hwpoison flag of the poisoned tail page: @@ -1775,6 +1775,8 @@ static int __split_huge_page_map(struct page *page, if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); + if (pmd_write(*pmd)) + BUG_ON(page_mapcount(page) != 1); haddr = address; for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1784,8 +1786,6 @@ static int __split_huge_page_map(struct page *page, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (!pmd_write(*pmd)) entry = pte_wrprotect(entry); - else - BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); if (pmd_numa(*pmd)) @@ -2233,6 +2233,30 @@ static void khugepaged_alloc_sleep(void) static int khugepaged_node_load[MAX_NUMNODES]; +static bool khugepaged_scan_abort(int nid) +{ + int i; + + /* + * If zone_reclaim_mode is disabled, then no extra effort is made to + * allocate memory locally. + */ + if (!zone_reclaim_mode) + return false; + + /* If there is a count for this node already, it must be acceptable */ + if (khugepaged_node_load[nid]) + return false; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!khugepaged_node_load[i]) + continue; + if (node_distance(nid, i) > RECLAIM_DISTANCE) + return true; + } + return false; +} + #ifdef CONFIG_NUMA static int khugepaged_find_target_node(void) { @@ -2399,7 +2423,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) return; /* @@ -2545,6 +2569,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * hit record. */ node = page_to_nid(page); + if (khugepaged_scan_abort(node)) + goto out_unmap; khugepaged_node_load[node]++; VM_BUG_ON_PAGE(PageCompound(page), page); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a0a73d..eeceeeb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,6 @@ #include <linux/node.h> #include "internal.h" -const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; @@ -1089,6 +1088,9 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) unsigned long pfn; struct hstate *h; + if (!hugepages_supported()) + return; + /* Set scan step to minimum hugepage size */ for_each_hstate(h) if (order > huge_page_order(h)) @@ -1734,21 +1736,13 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } -static ssize_t nr_hugepages_store_common(bool obey_mempolicy, - struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, + struct hstate *h, int nid, + unsigned long count, size_t len) { int err; - int nid; - unsigned long count; - struct hstate *h; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = kstrtoul(buf, 10, &count); - if (err) - goto out; - - h = kobj_to_hstate(kobj, &nid); if (hstate_is_gigantic(h) && !gigantic_page_supported()) { err = -EINVAL; goto out; @@ -1784,6 +1778,23 @@ out: return err; } +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, const char *buf, + size_t len) +{ + struct hstate *h; + unsigned long count; + int nid; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + + h = kobj_to_hstate(kobj, &nid); + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); +} + static ssize_t nr_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1793,7 +1804,7 @@ static ssize_t nr_hugepages_show(struct kobject *kobj, static ssize_t nr_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(false, kobj, attr, buf, len); + return nr_hugepages_store_common(false, kobj, buf, len); } HSTATE_ATTR(nr_hugepages); @@ -1812,7 +1823,7 @@ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(true, kobj, attr, buf, len); + return nr_hugepages_store_common(true, kobj, buf, len); } HSTATE_ATTR(nr_hugepages_mempolicy); #endif @@ -2248,36 +2259,21 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; - unsigned long tmp; + unsigned long tmp = h->max_huge_pages; int ret; if (!hugepages_supported()) return -ENOTSUPP; - tmp = h->max_huge_pages; - - if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) - return -EINVAL; - table->data = &tmp; table->maxlen = sizeof(unsigned long); ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); if (ret) goto out; - if (write) { - NODEMASK_ALLOC(nodemask_t, nodes_allowed, - GFP_KERNEL | __GFP_NORETRY); - if (!(obey_mempolicy && - init_nodemask_of_mempolicy(nodes_allowed))) { - NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_MEMORY]; - } - h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); - - if (nodes_allowed != &node_states[N_MEMORY]) - NODEMASK_FREE(nodes_allowed); - } + if (write) + ret = __nr_hugepages_store_common(obey_mempolicy, h, + NUMA_NO_NODE, tmp, *length); out: return ret; } @@ -2754,8 +2750,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ -static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, unsigned long address) +static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; @@ -2794,8 +2790,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, address + huge_page_size(h), page); } mutex_unlock(&mapping->i_mmap_mutex); - - return 1; } /* @@ -2810,7 +2804,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int outside_reserve = 0; + int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2840,14 +2834,14 @@ retry_avoidcopy: page_cache_get(old_page); - /* Drop page table lock as buddy allocator may be called */ + /* + * Drop page table lock as buddy allocator may be called. It will + * be acquired again before returning to the caller, as expected. + */ spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { - long err = PTR_ERR(new_page); - page_cache_release(old_page); - /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -2856,29 +2850,25 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { + page_cache_release(old_page); BUG_ON(huge_pte_none(pte)); - if (unmap_ref_private(mm, vma, old_page, address)) { - BUG_ON(huge_pte_none(pte)); - spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(ptep && - pte_same(huge_ptep_get(ptep), pte))) - goto retry_avoidcopy; - /* - * race occurs while re-acquiring page table - * lock, and our job is done. - */ - return 0; - } - WARN_ON_ONCE(1); + unmap_ref_private(mm, vma, old_page, address); + BUG_ON(huge_pte_none(pte)); + spin_lock(ptl); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page table + * lock, and our job is done. + */ + return 0; } - /* Caller expects lock to be held */ - spin_lock(ptl); - if (err == -ENOMEM) - return VM_FAULT_OOM; - else - return VM_FAULT_SIGBUS; + ret = (PTR_ERR(new_page) == -ENOMEM) ? + VM_FAULT_OOM : VM_FAULT_SIGBUS; + goto out_release_old; } /* @@ -2886,11 +2876,8 @@ retry_avoidcopy: * anon_vma prepared. */ if (unlikely(anon_vma_prepare(vma))) { - page_cache_release(new_page); - page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto out_release_all; } copy_user_huge_page(new_page, old_page, address, vma, @@ -2900,6 +2887,7 @@ retry_avoidcopy: mmun_start = address & huge_page_mask(h); mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * Retake the page table lock to check for racing updates * before the page tables are altered @@ -2920,12 +2908,13 @@ retry_avoidcopy: } spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); +out_release_all: page_cache_release(new_page); +out_release_old: page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return 0; + spin_lock(ptl); /* Caller expects lock to be held */ + return ret; } /* Return the pagecache page at a given address within a VMA */ diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 95487c7..329caf5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -72,8 +72,7 @@ DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { - if (hwpoison_dir) - debugfs_remove_recursive(hwpoison_dir); + debugfs_remove_recursive(hwpoison_dir); } static int pfn_inject_init(void) diff --git a/mm/internal.h b/mm/internal.h index 7f22a11f..a1b651b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -247,7 +247,7 @@ static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline struct page *mem_map_offset(struct page *base, int offset) { if (unlikely(offset >= MAX_ORDER_NR_PAGES)) - return pfn_to_page(page_to_pfn(base) + offset); + return nth_page(base, offset); return base + offset; } diff --git a/mm/madvise.c b/mm/madvise.c index a402f8f..0938b30 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -292,9 +292,6 @@ static long madvise_dontneed(struct vm_area_struct *vma, /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. - * - * NOTE: Currently, only shmfs/tmpfs is supported for this operation. - * Other filesystems return -ENOSYS. */ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f009a14..90dc501 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2551,55 +2551,72 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } - -/* See mem_cgroup_try_charge() for details */ -enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ -}; - -static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) +/** + * mem_cgroup_try_charge - try charging a memcg + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * + * Returns 0 if @memcg was charged successfully, -EINTR if the charge + * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. + */ +static int mem_cgroup_try_charge(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned int nr_pages) { - unsigned long csize = nr_pages * PAGE_SIZE; + unsigned int batch = max(CHARGE_BATCH, nr_pages); + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; + unsigned long nr_reclaimed; unsigned long flags = 0; - int ret; + unsigned long long size; + int ret = 0; - ret = res_counter_charge(&memcg->res, csize, &fail_res); +retry: + if (consume_stock(memcg, nr_pages)) + goto done; - if (likely(!ret)) { + size = batch * PAGE_SIZE; + if (!res_counter_charge(&memcg->res, size, &fail_res)) { if (!do_swap_account) - return CHARGE_OK; - ret = res_counter_charge(&memcg->memsw, csize, &fail_res); - if (likely(!ret)) - return CHARGE_OK; - - res_counter_uncharge(&memcg->res, csize); + goto done_restock; + if (!res_counter_charge(&memcg->memsw, size, &fail_res)) + goto done_restock; + res_counter_uncharge(&memcg->res, size); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } + /* - * Never reclaim on behalf of optional batching, retry with a - * single page instead. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. */ - if (nr_pages > min_pages) - return CHARGE_RETRY; + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem; if (!(gfp_mask & __GFP_WAIT)) - return CHARGE_WOULDBLOCK; + goto nomem; - if (gfp_mask & __GFP_NORETRY) - return CHARGE_NOMEM; + nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) - return CHARGE_RETRY; + goto retry; + + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge @@ -2609,96 +2626,38 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) - return CHARGE_RETRY; - + if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; /* * At task move, charge accounts can be doubly counted. So, it's * better to wait until the end of task_move if something is going on. */ if (mem_cgroup_wait_acct_move(mem_over_limit)) - return CHARGE_RETRY; - - if (invoke_oom) - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - - return CHARGE_NOMEM; -} - -/** - * mem_cgroup_try_charge - try charging a memcg - * @memcg: memcg to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns 0 if @memcg was charged successfully, -EINTR if the charge - * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. - */ -static int mem_cgroup_try_charge(struct mem_cgroup *memcg, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) -{ - unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - int ret; - - if (mem_cgroup_is_root(memcg)) - goto done; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) - goto bypass; + goto retry; - if (unlikely(task_in_memcg_oom(current))) - goto nomem; + if (nr_retries--) + goto retry; if (gfp_mask & __GFP_NOFAIL) - oom = false; -again: - if (consume_stock(memcg, nr_pages)) - goto done; - - do { - bool invoke_oom = oom && !nr_oom_retries; - - /* If killed, bypass charge */ - if (fatal_signal_pending(current)) - goto bypass; + goto bypass; - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, - nr_pages, invoke_oom); - switch (ret) { - case CHARGE_OK: - break; - case CHARGE_RETRY: /* not in OOM situation but retry */ - batch = nr_pages; - goto again; - case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - goto nomem; - case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) - goto nomem; - nr_oom_retries--; - break; - } - } while (ret != CHARGE_OK); + if (fatal_signal_pending(current)) + goto bypass; - if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); -done: - return 0; + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: - return -EINTR; + memcg = root_mem_cgroup; + ret = -EINTR; + goto retry; + +done_restock: + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); +done: + return ret; } /** @@ -2712,15 +2671,14 @@ bypass: */ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) + unsigned int nr_pages) { struct mem_cgroup *memcg; int ret; memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); + ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages); css_put(&memcg->css); if (ret == -EINTR) memcg = root_mem_cgroup; @@ -2738,13 +2696,11 @@ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(memcg)) { - unsigned long bytes = nr_pages * PAGE_SIZE; + unsigned long bytes = nr_pages * PAGE_SIZE; - res_counter_uncharge(&memcg->res, bytes); - if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); - } + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); } /* @@ -2756,9 +2712,6 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, { unsigned long bytes = nr_pages * PAGE_SIZE; - if (mem_cgroup_is_root(memcg)) - return; - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); if (do_swap_account) res_counter_uncharge_until(&memcg->memsw, @@ -2842,14 +2795,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, } pc->mem_cgroup = memcg; - /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). - * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. - */ - smp_wmb(); SetPageCgroupUsed(pc); if (lrucare) { @@ -2937,8 +2882,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) if (ret) return ret; - ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, - oom_gfp_allowed(gfp)); + ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* * mem_cgroup_try_charge() chosed to bypass to root due to @@ -3463,12 +3407,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, memcg_uncharge_kmem(memcg, PAGE_SIZE << order); return; } - + /* + * The page is freshly allocated and not visible to any + * outside callers yet. Set up pc non-atomically. + */ pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - unlock_page_cgroup(pc); + pc->flags = PCG_USED; } void __memcg_kmem_uncharge_pages(struct page *page, int order) @@ -3478,19 +3423,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) pc = lookup_page_cgroup(page); - /* - * Fast unlocked return. Theoretically might have changed, have to - * check again after locking. - */ if (!PageCgroupUsed(pc)) return; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); + memcg = pc->mem_cgroup; + pc->flags = 0; /* * We trust that only if there is a memcg associated with the page, it @@ -3531,7 +3468,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], @@ -3687,7 +3623,6 @@ int mem_cgroup_charge_anon(struct page *page, { unsigned int nr_pages = 1; struct mem_cgroup *memcg; - bool oom = true; if (mem_cgroup_disabled()) return 0; @@ -3699,14 +3634,9 @@ int mem_cgroup_charge_anon(struct page *page, if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON_PAGE(!PageTransHuge(page), page); - /* - * Never OOM-kill a process for a huge page. The - * fault handler will fall back to regular pages. - */ - oom = false; } - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages); if (!memcg) return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, nr_pages, @@ -3743,7 +3673,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, memcg = try_get_mem_cgroup_from_page(page); if (!memcg) memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1, true); + ret = mem_cgroup_try_charge(memcg, mask, 1); css_put(&memcg->css); if (ret == -EINTR) memcg = root_mem_cgroup; @@ -3770,7 +3700,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, if (!PageSwapCache(page)) { struct mem_cgroup *memcg; - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); if (!memcg) return -ENOMEM; *memcgp = memcg; @@ -3839,7 +3769,7 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, return 0; } - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); if (!memcg) return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, 1, type, false); @@ -3993,7 +3923,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, * replacement page, so leave it alone when phasing out the * page that is unused after the migration. */ - if (!end_migration && !mem_cgroup_is_root(memcg)) + if (!end_migration) mem_cgroup_do_uncharge(memcg, nr_pages, ctype); return memcg; @@ -4126,8 +4056,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * We uncharge this because swap is freed. This memcg can * be obsolete one. We avoid calling css_tryget_online(). */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } @@ -4817,78 +4746,24 @@ out: return retval; } - -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) -{ - struct mem_cgroup *iter; - long val = 0; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); - - if (val < 0) /* race ? */ - val = 0; - return val; -} - -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) -{ - u64 val; - - if (!mem_cgroup_is_root(memcg)) { - if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); - else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); - } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. - */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); - - if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); - - return val << PAGE_SHIFT; -} - static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - int name; - enum res_type type; - - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); + int name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, false); - else - val = res_counter_read_u64(&memcg->res, name); - break; + return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, true); - else - val = res_counter_read_u64(&memcg->memsw, name); - break; + return res_counter_read_u64(&memcg->memsw, name); case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); + return res_counter_read_u64(&memcg->kmem, name); break; default: BUG(); } - - return val; } #ifdef CONFIG_MEMCG_KMEM @@ -5350,7 +5225,10 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) if (!t) goto unlock; - usage = mem_cgroup_usage(memcg, swap); + if (!swap) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* * current_threshold points to threshold just below or equal to usage. @@ -5446,15 +5324,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG(); - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before adding a new one */ if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -5534,18 +5412,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, int i, j, size; mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG(); if (!thresholds->primary) goto unlock; - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before removing */ __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -6299,9 +6178,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + res_counter_init(&memcg->res, &root_mem_cgroup->res); + res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); + res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -6435,55 +6314,39 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ -#define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) { - int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; + int ret; - if (mem_cgroup_is_root(memcg)) { + /* Try a single bulk charge without reclaim first */ + ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; - /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct res_counter *dummy; - /* - * "memcg" cannot be under rmdir() because we've already checked - * by cgroup_lock_live_cgroup() that it is not removed and we - * are still under the same cgroup_mutex. So we can postpone - * css_get(). - */ - if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) - goto one_by_one; - if (do_swap_account && res_counter_charge(&memcg->memsw, - PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&memcg->res, PAGE_SIZE * count); - goto one_by_one; - } - mc.precharge += count; + if (ret == -EINTR) { + __mem_cgroup_cancel_charge(root_mem_cgroup, count); return ret; } -one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); + ret = mem_cgroup_try_charge(mc.to, + GFP_KERNEL & ~__GFP_NORETRY, 1); + /* + * In case of failure, any residual charges against + * mc.to will be dropped by mem_cgroup_clear_mc() + * later on. However, cancel any charges that are + * bypassed to root right away or they'll be lost. + */ + if (ret == -EINTR) + __mem_cgroup_cancel_charge(root_mem_cgroup, 1); if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; } /** @@ -6760,21 +6623,18 @@ static void __mem_cgroup_clear_mc(void) /* we must fixup refcnts and charges */ if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ - if (!mem_cgroup_is_root(mc.from)) - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap); for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css); - if (!mem_cgroup_is_root(mc.to)) { - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); - } + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a013bc9..44c6bd2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1173,6 +1173,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) lock_page(hpage); /* + * The page could have changed compound pages during the locking. + * If this happens just bail out. + */ + if (compound_head(p) != hpage) { + action_result(pfn, "different compound page after locking", IGNORED); + res = -EBUSY; + goto out; + } + + /* * We use page flags to determine what action should be taken, but * the flags can be modified by the error containment action. One * example is an mlocked page, where PG_mlocked is cleared by diff --git a/mm/memory.c b/mm/memory.c index 8b44f76..5c55270 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -884,7 +884,7 @@ out_set_pte: return 0; } -int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { @@ -2399,7 +2399,10 @@ EXPORT_SYMBOL(unmap_mapping_range); /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * + * We return with the mmap_sem locked or unlocked in the same cases + * as does filemap_fault(). */ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -2688,6 +2691,11 @@ oom: return VM_FAULT_OOM; } +/* + * The mmap_sem must have been held on entry, and may have been + * released depending on flags and vma->vm_ops->fault() return value. + * See filemap_fault() and __lock_page_retry(). + */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, pgoff_t pgoff, unsigned int flags, struct page **page) { @@ -2744,7 +2752,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) - pte_mksoft_dirty(entry); + entry = pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); @@ -2758,17 +2766,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } -static unsigned long fault_around_bytes = rounddown_pow_of_two(65536); - -static inline unsigned long fault_around_pages(void) -{ - return fault_around_bytes >> PAGE_SHIFT; -} - -static inline unsigned long fault_around_mask(void) -{ - return ~(fault_around_bytes - 1) & PAGE_MASK; -} +static unsigned long fault_around_bytes __read_mostly = + rounddown_pow_of_two(65536); #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) @@ -2834,12 +2833,15 @@ late_initcall(fault_around_debugfs); static void do_fault_around(struct vm_area_struct *vma, unsigned long address, pte_t *pte, pgoff_t pgoff, unsigned int flags) { - unsigned long start_addr; + unsigned long start_addr, nr_pages, mask; pgoff_t max_pgoff; struct vm_fault vmf; int off; - start_addr = max(address & fault_around_mask(), vma->vm_start); + nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; + mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; + + start_addr = max(address & mask, vma->vm_start); off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); pte -= off; pgoff -= off; @@ -2851,7 +2853,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, - pgoff + fault_around_pages() - 1); + pgoff + nr_pages - 1); /* Check if it makes any sense to call ->map_pages */ while (!pte_none(*pte)) { @@ -2886,7 +2888,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * something). */ if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && - fault_around_pages() > 1) { + fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) @@ -3016,6 +3018,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults). + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) @@ -3040,7 +3048,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -3172,7 +3182,10 @@ out: * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -3181,7 +3194,7 @@ static int handle_pte_fault(struct mm_struct *mm, pte_t entry; spinlock_t *ptl; - entry = *pte; + entry = ACCESS_ONCE(*pte); if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { @@ -3232,6 +3245,9 @@ unlock: /* * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -3313,6 +3329,12 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +/* + * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { @@ -3591,11 +3613,13 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, ret = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); if (ret <= 0) { +#ifndef CONFIG_HAVE_IOREMAP_PROT + break; +#else /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ -#ifdef CONFIG_HAVE_IOREMAP_PROT vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) break; @@ -3603,9 +3627,9 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, ret = vma->vm_ops->access(vma, addr, buf, len, write); if (ret <= 0) -#endif break; bytes = ret; +#endif } else { bytes = len; offset = addr & (PAGE_SIZE-1); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 469bbf5..2ff8c23 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -284,8 +284,8 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static void grow_zone_span(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) +static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) { unsigned long old_zone_end_pfn; @@ -427,8 +427,8 @@ out_fail: return -1; } -static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, - unsigned long end_pfn) +static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long end_pfn) { unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); @@ -977,15 +977,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone = page_zone(pfn_to_page(pfn)); ret = -EINVAL; - if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && + if ((zone_idx(zone) > ZONE_NORMAL || + online_type == MMOP_ONLINE_MOVABLE) && !can_online_high_movable(zone)) goto out; - if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { + if (online_type == MMOP_ONLINE_KERNEL && + zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) goto out; } - if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { + if (online_type == MMOP_ONLINE_MOVABLE && + zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) goto out; } @@ -1156,6 +1159,34 @@ static int check_hotplug_memory_range(u64 start, u64 size) return 0; } +/* + * If movable zone has already been setup, newly added memory should be check. + * If its address is higher than movable zone, it should be added as movable. + * Without this check, movable zone may overlap with other zone. + */ +static int should_add_memory_movable(int nid, u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; + + if (zone_is_empty(movable_zone)) + return 0; + + if (movable_zone->zone_start_pfn <= start_pfn) + return 1; + + return 0; +} + +int zone_for_memory(int nid, u64 start, u64 size, int zone_default) +{ + if (should_add_memory_movable(nid, start, size)) + return ZONE_MOVABLE; + + return zone_default; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { @@ -210,12 +210,19 @@ out: * @vma: target vma * @start: start address * @end: end address + * @nonblocking: * * This takes care of making the pages present too. * * return 0 on success, negative error code on error. * - * vma->vm_mm->mmap_sem must be held for at least read. + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. */ long __mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking) @@ -31,6 +31,7 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> +#include <linux/mmdebug.h> #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> @@ -134,6 +135,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed, reserve; + VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus(), + "memory commitment underflow"); + vm_acct_memory(pages); /* diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 41cefdf..950813b 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -23,6 +23,25 @@ static struct srcu_struct srcu; /* + * This function allows mmu_notifier::release callback to delay a call to + * a function that will free appropriate resources. The function must be + * quick and must not block. + */ +void mmu_notifier_call_srcu(struct rcu_head *rcu, + void (*func)(struct rcu_head *rcu)) +{ + call_srcu(&srcu, rcu, func); +} +EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); + +void mmu_notifier_synchronize(void) +{ + /* Wait for any running method to finish. */ + srcu_barrier(&srcu); +} +EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); + +/* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap * runs with mm_users == 0. Other tasks may still invoke mmu notifiers @@ -53,7 +72,6 @@ void __mmu_notifier_release(struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -69,6 +87,7 @@ void __mmu_notifier_release(struct mm_struct *mm) hlist_del_init_rcu(&mn->hlist); } spin_unlock(&mm->mmu_notifier_mm->lock); + srcu_read_unlock(&srcu, id); /* * synchronize_srcu here prevents mmu_notifier_release from returning to @@ -325,6 +344,25 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); +/* + * Same as mmu_notifier_unregister but no callback and no srcu synchronization. + */ +void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + spin_lock(&mm->mmu_notifier_mm->lock); + /* + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. + */ + hlist_del_init_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + static int __init mmu_notifier_init(void) { return init_srcu_struct(&srcu); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3291e82..1e11df8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -258,8 +258,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill) { - if (task->exit_state) - return OOM_SCAN_CONTINUE; if (oom_unkillable_task(task, NULL, nodemask)) return OOM_SCAN_CONTINUE; @@ -559,28 +557,25 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; - int ret = 1; + bool ret = true; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) if (zone_is_oom_locked(zone)) { - ret = 0; + ret = false; goto out; } - } - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { - /* - * Lock each zone in the zonelist under zone_scan_lock so a - * parallel invocation of try_set_zonelist_oom() doesn't succeed - * when it shouldn't. - */ + /* + * Lock each zone in the zonelist under zone_scan_lock so a parallel + * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. + */ + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) zone_set_flag(zone, ZONE_OOM_LOCKED); - } out: spin_unlock(&zone_scan_lock); @@ -592,15 +587,14 @@ out: * allocation attempts with zonelists containing them may now recall the OOM * killer, if necessary. */ -void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) zone_clear_flag(zone, ZONE_OOM_LOCKED); - } spin_unlock(&zone_scan_lock); } @@ -694,9 +688,9 @@ void pagefault_out_of_memory(void) if (mem_cgroup_oom_synchronize(true)) return; - zonelist = node_zonelist(first_online_node, GFP_KERNEL); - if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { + zonelist = node_zonelist(first_memory_node, GFP_KERNEL); + if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); - clear_zonelist_oom(zonelist, GFP_KERNEL); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e0c94301..91d73ef 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -261,14 +261,11 @@ static unsigned long global_dirtyable_memory(void) */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { + const unsigned long available_memory = global_dirtyable_memory(); unsigned long background; unsigned long dirty; - unsigned long uninitialized_var(available_memory); struct task_struct *tsk; - if (!vm_dirty_bytes || !dirty_background_bytes) - available_memory = global_dirtyable_memory(); - if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); else diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef44ad7..18cee0d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -680,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, int migratetype = 0; int batch_free = 0; int to_free = count; + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); while (to_free) { struct page *page; @@ -731,8 +734,11 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) @@ -1257,15 +1263,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; - int to_drain; - unsigned long batch; + int to_drain, batch; local_irq_save(flags); batch = ACCESS_ONCE(pcp->batch); - if (pcp->count >= batch) - to_drain = batch; - else - to_drain = pcp->count; + to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; @@ -1610,6 +1612,9 @@ again: } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && + !zone_is_fair_depleted(zone)) + zone_set_flag(zone, ZONE_FAIR_DEPLETED); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1712,7 +1717,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, { /* free_pages my go negative - that's OK */ long min = mark; - long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; long free_cma = 0; @@ -1727,7 +1731,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + lowmem_reserve) + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -1922,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) #endif /* CONFIG_NUMA */ +static void reset_alloc_batches(struct zone *preferred_zone) +{ + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + zone_clear_flag(zone, ZONE_FAIR_DEPLETED); + } while (zone++ != preferred_zone); +} + /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -1939,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int did_zlc_setup = 0; /* just call zlc_setup() one time */ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; zonelist_scan: + zonelist_rescan = false; + /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. @@ -1964,9 +1984,11 @@ zonelist_scan: */ if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) + break; + if (zone_is_fair_depleted(zone)) { + nr_fair_skipped++; continue; - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) - continue; + } } /* * When allocating a page cache page for writing, we @@ -2072,13 +2094,7 @@ this_zone_full: zlc_mark_zone_full(zonelist, z); } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - goto zonelist_scan; - } - - if (page) + if (page) { /* * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was * necessary to allocate the page. The expectation is @@ -2087,8 +2103,37 @@ this_zone_full: * for !PFMEMALLOC purposes. */ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; + } - return page; + /* + * The first pass makes sure allocations are spread fairly within the + * local node. However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; } /* @@ -2201,8 +2246,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, { struct page *page; - /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zonelist_oom(zonelist, gfp_mask)) { + /* Acquire the per-zone oom lock for each zone */ + if (!oom_zonelist_trylock(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } @@ -2240,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, out_of_memory(zonelist, gfp_mask, order, nodemask, false); out: - clear_zonelist_oom(zonelist, gfp_mask); + oom_zonelist_unlock(zonelist, gfp_mask); return page; } @@ -2409,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void reset_alloc_batches(struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - /* - * Only reset the batches of zones that were actually - * considered in the fairness pass, we don't want to - * trash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. - */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - } -} - static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2616,14 +2639,6 @@ rebalance: goto got_pg; /* - * It can become very expensive to allocate transparent hugepages at - * fault, so use asynchronous memory compaction for THP unless it is - * khugepaged trying to collapse. - */ - if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) - migration_mode = MIGRATE_SYNC_LIGHT; - - /* * If compaction is deferred for high-order allocations, it is because * sync compaction recently failed. In this is the case and the caller * requested a movable allocation that does not heavily disrupt the @@ -2633,6 +2648,15 @@ rebalance: (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; + /* + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. + */ + if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || + (current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_SYNC_LIGHT; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -2766,29 +2790,12 @@ retry_cpuset: if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif -retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* - * The first pass makes sure allocations are spread - * fairly within the local node. However, the local - * node might have free pages left after the fairness - * batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without - * fairness, and include remote zones now, before - * entering the slowpath and waking kswapd: prefer - * spilling to a remote zone over swapping locally. - */ - if (alloc_flags & ALLOC_FAIR) { - reset_alloc_batches(zonelist, high_zoneidx, - preferred_zone); - alloc_flags &= ~ALLOC_FAIR; - goto retry; - } - /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not * complete. @@ -2962,7 +2969,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * Note this is not alloc_pages_exact_node() which allocates on a specific node, * but is not exact. */ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); @@ -2970,7 +2977,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); } -EXPORT_SYMBOL(alloc_pages_exact_nid); /** * free_pages_exact - release memory allocated via alloc_pages_exact() @@ -3052,7 +3058,7 @@ static inline void show_node(struct zone *zone) void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = 0; + val->sharedram = global_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -3072,6 +3078,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; + val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; @@ -3253,12 +3260,12 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - zone->pages_scanned, + K(zone_page_state(zone, NR_PAGES_SCANNED)), (!zone_reclaimable(zone) ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->lowmem_reserve[i]); + printk(" %ld", zone->lowmem_reserve[i]); printk("\n"); } @@ -5579,7 +5586,7 @@ static void calculate_totalreserve_pages(void) for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - unsigned long max = 0; + long max = 0; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { diff --git a/mm/readahead.c b/mm/readahead.c index 0ca36a7..17b9172 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -326,7 +326,6 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, - struct file_ra_state *ra, pgoff_t offset, unsigned long max) { pgoff_t head; @@ -349,7 +348,7 @@ static int try_context_readahead(struct address_space *mapping, { pgoff_t size; - size = count_history_pages(mapping, ra, offset, max); + size = count_history_pages(mapping, offset, max); /* * not enough history pages: @@ -149,6 +149,19 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) vm_unacct_memory(VM_ACCT(size)); } +static inline int shmem_reacct_size(unsigned long flags, + loff_t oldsize, loff_t newsize) +{ + if (!(flags & VM_NORESERVE)) { + if (VM_ACCT(newsize) > VM_ACCT(oldsize)) + return security_vm_enough_memory_mm(current->mm, + VM_ACCT(newsize) - VM_ACCT(oldsize)); + else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) + vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); + } + return 0; +} + /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow huge sparse files. @@ -280,7 +293,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp, void *expected) + pgoff_t index, void *expected) { int error; @@ -549,6 +562,10 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) loff_t newsize = attr->ia_size; if (newsize != oldsize) { + error = shmem_reacct_size(SHMEM_I(inode)->flags, + oldsize, newsize); + if (error) + return error; i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } @@ -649,7 +666,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, */ if (!error) error = shmem_add_to_page_cache(*pagep, mapping, index, - GFP_NOWAIT, radswap); + radswap); if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which @@ -1095,7 +1112,7 @@ repeat: gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, swp_to_radix_entry(swap)); + swp_to_radix_entry(swap)); /* * We already confirmed swap under page lock, and make * no memory allocation here, so usually no possibility @@ -1149,7 +1166,7 @@ repeat: __SetPageSwapBacked(page); __set_page_locked(page); if (sgp == SGP_WRITE) - init_page_accessed(page); + __SetPageReferenced(page); error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); @@ -1158,7 +1175,7 @@ repeat: error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); + NULL); radix_tree_preload_end(); } if (error) { @@ -2932,16 +2949,16 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, this.len = strlen(name); this.hash = 0; /* will go */ sb = shm_mnt->mnt_sb; + path.mnt = mntget(shm_mnt); path.dentry = d_alloc_pseudo(sb, &this); if (!path.dentry) goto put_memory; d_set_d_op(path.dentry, &anon_ops); - path.mnt = mntget(shm_mnt); res = ERR_PTR(-ENOSPC); inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) - goto put_dentry; + goto put_memory; inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); @@ -2949,19 +2966,19 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (IS_ERR(res)) - goto put_dentry; + goto put_path; res = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); if (IS_ERR(res)) - goto put_dentry; + goto put_path; return res; -put_dentry: - path_put(&path); put_memory: shmem_unacct_size(flags, size); +put_path: + path_put(&path); return res; } @@ -191,7 +191,6 @@ struct array_cache { unsigned int limit; unsigned int batchcount; unsigned int touched; - spinlock_t lock; void *entry[]; /* * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing @@ -203,6 +202,11 @@ struct array_cache { */ }; +struct alien_cache { + spinlock_t lock; + struct array_cache ac; +}; + #define SLAB_OBJ_PFMEMALLOC 1 static inline bool is_obj_pfmemalloc(void *objp) { @@ -242,7 +246,8 @@ static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; static int drain_freelist(struct kmem_cache *cache, struct kmem_cache_node *n, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); + int node, struct list_head *list); +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); @@ -267,7 +272,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define MAKE_LIST(cachep, listp, slab, nodeid) \ do { \ INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->node[nodeid]->slab), listp); \ + list_splice(&get_node(cachep, nodeid)->slab, listp); \ } while (0) #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ @@ -465,143 +470,6 @@ static struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", }; -#define BAD_ALIEN_MAGIC 0x01020304ul - -#ifdef CONFIG_LOCKDEP - -/* - * Slab sometimes uses the kmalloc slabs to store the slab headers - * for other slabs "off slab". - * The locking for this is tricky in that it nests within the locks - * of all other slabs in a few places; to deal with this special - * locking we put on-slab caches into a separate lock-class. - * - * We set lock class for alien array caches which are up during init. - * The lock annotation will be lost if all cpus of a node goes down and - * then comes back up during hotplug - */ -static struct lock_class_key on_slab_l3_key; -static struct lock_class_key on_slab_alc_key; - -static struct lock_class_key debugobj_l3_key; -static struct lock_class_key debugobj_alc_key; - -static void slab_set_lock_classes(struct kmem_cache *cachep, - struct lock_class_key *l3_key, struct lock_class_key *alc_key, - int q) -{ - struct array_cache **alc; - struct kmem_cache_node *n; - int r; - - n = cachep->node[q]; - if (!n) - return; - - lockdep_set_class(&n->list_lock, l3_key); - alc = n->alien; - /* - * FIXME: This check for BAD_ALIEN_MAGIC - * should go away when common slab code is taught to - * work even without alien caches. - * Currently, non NUMA code returns BAD_ALIEN_MAGIC - * for alloc_alien_cache, - */ - if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - return; - for_each_node(r) { - if (alc[r]) - lockdep_set_class(&alc[r]->lock, alc_key); - } -} - -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) -{ - slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); -} - -static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) -{ - int node; - - for_each_online_node(node) - slab_set_debugobj_lock_classes_node(cachep, node); -} - -static void init_node_lock_keys(int q) -{ - int i; - - if (slab_state < UP) - return; - - for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache_node *n; - struct kmem_cache *cache = kmalloc_caches[i]; - - if (!cache) - continue; - - n = cache->node[q]; - if (!n || OFF_SLAB(cache)) - continue; - - slab_set_lock_classes(cache, &on_slab_l3_key, - &on_slab_alc_key, q); - } -} - -static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) -{ - if (!cachep->node[q]) - return; - - slab_set_lock_classes(cachep, &on_slab_l3_key, - &on_slab_alc_key, q); -} - -static inline void on_slab_lock_classes(struct kmem_cache *cachep) -{ - int node; - - VM_BUG_ON(OFF_SLAB(cachep)); - for_each_node(node) - on_slab_lock_classes_node(cachep, node); -} - -static inline void init_lock_keys(void) -{ - int node; - - for_each_node(node) - init_node_lock_keys(node); -} -#else -static void init_node_lock_keys(int q) -{ -} - -static inline void init_lock_keys(void) -{ -} - -static inline void on_slab_lock_classes(struct kmem_cache *cachep) -{ -} - -static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) -{ -} - -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) -{ -} - -static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) -{ -} -#endif - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -792,13 +660,8 @@ static void start_cpu_timer(int cpu) } } -static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount, gfp_t gfp) +static void init_arraycache(struct array_cache *ac, int limit, int batch) { - int memsize = sizeof(void *) * entries + sizeof(struct array_cache); - struct array_cache *nc = NULL; - - nc = kmalloc_node(memsize, gfp, node); /* * The array_cache structures contain pointers to free object. * However, when such objects are allocated or transferred to another @@ -806,15 +669,24 @@ static struct array_cache *alloc_arraycache(int node, int entries, * valid references during a kmemleak scan. Therefore, kmemleak must * not scan such objects. */ - kmemleak_no_scan(nc); - if (nc) { - nc->avail = 0; - nc->limit = entries; - nc->batchcount = batchcount; - nc->touched = 0; - spin_lock_init(&nc->lock); + kmemleak_no_scan(ac); + if (ac) { + ac->avail = 0; + ac->limit = limit; + ac->batchcount = batch; + ac->touched = 0; } - return nc; +} + +static struct array_cache *alloc_arraycache(int node, int entries, + int batchcount, gfp_t gfp) +{ + size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); + struct array_cache *ac = NULL; + + ac = kmalloc_node(memsize, gfp, node); + init_arraycache(ac, entries, batchcount); + return ac; } static inline bool is_slab_pfmemalloc(struct page *page) @@ -826,7 +698,7 @@ static inline bool is_slab_pfmemalloc(struct page *page) static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { - struct kmem_cache_node *n = cachep->node[numa_mem_id()]; + struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); struct page *page; unsigned long flags; @@ -881,7 +753,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, * If there are empty slabs on the slabs_free list and we are * being forced to refill the cache, mark this one !pfmemalloc. */ - n = cachep->node[numa_mem_id()]; + n = get_node(cachep, numa_mem_id()); if (!list_empty(&n->slabs_free) && force_refill) { struct page *page = virt_to_head_page(objp); ClearPageSlabPfmemalloc(page); @@ -961,12 +833,13 @@ static int transfer_objects(struct array_cache *to, #define drain_alien_cache(cachep, alien) do { } while (0) #define reap_alien(cachep, n) do { } while (0) -static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +static inline struct alien_cache **alloc_alien_cache(int node, + int limit, gfp_t gfp) { - return (struct array_cache **)BAD_ALIEN_MAGIC; + return NULL; } -static inline void free_alien_cache(struct array_cache **ac_ptr) +static inline void free_alien_cache(struct alien_cache **ac_ptr) { } @@ -992,46 +865,60 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); -static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +static struct alien_cache *__alloc_alien_cache(int node, int entries, + int batch, gfp_t gfp) +{ + size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); + struct alien_cache *alc = NULL; + + alc = kmalloc_node(memsize, gfp, node); + init_arraycache(&alc->ac, entries, batch); + spin_lock_init(&alc->lock); + return alc; +} + +static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { - struct array_cache **ac_ptr; - int memsize = sizeof(void *) * nr_node_ids; + struct alien_cache **alc_ptr; + size_t memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) limit = 12; - ac_ptr = kzalloc_node(memsize, gfp, node); - if (ac_ptr) { - for_each_node(i) { - if (i == node || !node_online(i)) - continue; - ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); - if (!ac_ptr[i]) { - for (i--; i >= 0; i--) - kfree(ac_ptr[i]); - kfree(ac_ptr); - return NULL; - } + alc_ptr = kzalloc_node(memsize, gfp, node); + if (!alc_ptr) + return NULL; + + for_each_node(i) { + if (i == node || !node_online(i)) + continue; + alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); + if (!alc_ptr[i]) { + for (i--; i >= 0; i--) + kfree(alc_ptr[i]); + kfree(alc_ptr); + return NULL; } } - return ac_ptr; + return alc_ptr; } -static void free_alien_cache(struct array_cache **ac_ptr) +static void free_alien_cache(struct alien_cache **alc_ptr) { int i; - if (!ac_ptr) + if (!alc_ptr) return; for_each_node(i) - kfree(ac_ptr[i]); - kfree(ac_ptr); + kfree(alc_ptr[i]); + kfree(alc_ptr); } static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node) + struct array_cache *ac, int node, + struct list_head *list) { - struct kmem_cache_node *n = cachep->node[node]; + struct kmem_cache_node *n = get_node(cachep, node); if (ac->avail) { spin_lock(&n->list_lock); @@ -1043,7 +930,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (n->shared) transfer_objects(n->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, list); ac->avail = 0; spin_unlock(&n->list_lock); } @@ -1057,28 +944,40 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) int node = __this_cpu_read(slab_reap_node); if (n->alien) { - struct array_cache *ac = n->alien[node]; + struct alien_cache *alc = n->alien[node]; + struct array_cache *ac; + + if (alc) { + ac = &alc->ac; + if (ac->avail && spin_trylock_irq(&alc->lock)) { + LIST_HEAD(list); - if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { - __drain_alien_cache(cachep, ac, node); - spin_unlock_irq(&ac->lock); + __drain_alien_cache(cachep, ac, node, &list); + spin_unlock_irq(&alc->lock); + slabs_destroy(cachep, &list); + } } } } static void drain_alien_cache(struct kmem_cache *cachep, - struct array_cache **alien) + struct alien_cache **alien) { int i = 0; + struct alien_cache *alc; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { - ac = alien[i]; - if (ac) { - spin_lock_irqsave(&ac->lock, flags); - __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + alc = alien[i]; + if (alc) { + LIST_HEAD(list); + + ac = &alc->ac; + spin_lock_irqsave(&alc->lock, flags); + __drain_alien_cache(cachep, ac, i, &list); + spin_unlock_irqrestore(&alc->lock, flags); + slabs_destroy(cachep, &list); } } } @@ -1087,8 +986,10 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { int nodeid = page_to_nid(virt_to_page(objp)); struct kmem_cache_node *n; - struct array_cache *alien = NULL; + struct alien_cache *alien = NULL; + struct array_cache *ac; int node; + LIST_HEAD(list); node = numa_mem_id(); @@ -1099,21 +1000,25 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) if (likely(nodeid == node)) return 0; - n = cachep->node[node]; + n = get_node(cachep, node); STATS_INC_NODEFREES(cachep); if (n->alien && n->alien[nodeid]) { alien = n->alien[nodeid]; + ac = &alien->ac; spin_lock(&alien->lock); - if (unlikely(alien->avail == alien->limit)) { + if (unlikely(ac->avail == ac->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, alien, nodeid); + __drain_alien_cache(cachep, ac, nodeid, &list); } - ac_put_obj(cachep, alien, objp); + ac_put_obj(cachep, ac, objp); spin_unlock(&alien->lock); + slabs_destroy(cachep, &list); } else { - spin_lock(&(cachep->node[nodeid])->list_lock); - free_block(cachep, &objp, 1, nodeid); - spin_unlock(&(cachep->node[nodeid])->list_lock); + n = get_node(cachep, nodeid); + spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, nodeid, &list); + spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); } return 1; } @@ -1132,7 +1037,7 @@ static int init_cache_node_node(int node) { struct kmem_cache *cachep; struct kmem_cache_node *n; - const int memsize = sizeof(struct kmem_cache_node); + const size_t memsize = sizeof(struct kmem_cache_node); list_for_each_entry(cachep, &slab_caches, list) { /* @@ -1140,7 +1045,8 @@ static int init_cache_node_node(int node) * begin anything. Make sure some other cpu on this * node has not already allocated this */ - if (!cachep->node[node]) { + n = get_node(cachep, node); + if (!n) { n = kmalloc_node(memsize, GFP_KERNEL, node); if (!n) return -ENOMEM; @@ -1156,11 +1062,11 @@ static int init_cache_node_node(int node) cachep->node[node] = n; } - spin_lock_irq(&cachep->node[node]->list_lock); - cachep->node[node]->free_limit = + spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->node[node]->list_lock); + spin_unlock_irq(&n->list_lock); } return 0; } @@ -1181,12 +1087,13 @@ static void cpuup_canceled(long cpu) list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared; - struct array_cache **alien; + struct alien_cache **alien; + LIST_HEAD(list); /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) goto free_array_cache; @@ -1196,7 +1103,7 @@ static void cpuup_canceled(long cpu) /* Free limit for this kmem_cache_node */ n->free_limit -= cachep->batchcount; if (nc) - free_block(cachep, nc->entry, nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node, &list); if (!cpumask_empty(mask)) { spin_unlock_irq(&n->list_lock); @@ -1206,7 +1113,7 @@ static void cpuup_canceled(long cpu) shared = n->shared; if (shared) { free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &list); n->shared = NULL; } @@ -1221,6 +1128,7 @@ static void cpuup_canceled(long cpu) free_alien_cache(alien); } free_array_cache: + slabs_destroy(cachep, &list); kfree(nc); } /* @@ -1229,7 +1137,7 @@ free_array_cache: * shrink each nodelist to its limit. */ list_for_each_entry(cachep, &slab_caches, list) { - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) continue; drain_freelist(cachep, n, slabs_tofree(cachep, n)); @@ -1260,7 +1168,7 @@ static int cpuup_prepare(long cpu) list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared = NULL; - struct array_cache **alien = NULL; + struct alien_cache **alien = NULL; nc = alloc_arraycache(node, cachep->limit, cachep->batchcount, GFP_KERNEL); @@ -1284,7 +1192,7 @@ static int cpuup_prepare(long cpu) } } cachep->array[cpu] = nc; - n = cachep->node[node]; + n = get_node(cachep, node); BUG_ON(!n); spin_lock_irq(&n->list_lock); @@ -1305,13 +1213,7 @@ static int cpuup_prepare(long cpu) spin_unlock_irq(&n->list_lock); kfree(shared); free_alien_cache(alien); - if (cachep->flags & SLAB_DEBUG_OBJECTS) - slab_set_debugobj_lock_classes_node(cachep, node); - else if (!OFF_SLAB(cachep) && - !(cachep->flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes_node(cachep, node); } - init_node_lock_keys(node); return 0; bad: @@ -1395,7 +1297,7 @@ static int __meminit drain_cache_node_node(int node) list_for_each_entry(cachep, &slab_caches, list) { struct kmem_cache_node *n; - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) continue; @@ -1575,10 +1477,6 @@ void __init kmem_cache_init(void) memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - spin_lock_init(&ptr->lock); kmem_cache->array[smp_processor_id()] = ptr; @@ -1588,10 +1486,6 @@ void __init kmem_cache_init(void) != &initarray_generic.cache); memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), sizeof(struct arraycache_init)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - spin_lock_init(&ptr->lock); kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; } @@ -1628,9 +1522,6 @@ void __init kmem_cache_init_late(void) BUG(); mutex_unlock(&slab_mutex); - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); - /* Done! */ slab_state = FULL; @@ -1690,14 +1581,10 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", cachep->name, cachep->size, cachep->gfporder); - for_each_online_node(node) { + for_each_kmem_cache_node(cachep, node, n) { unsigned long active_objs = 0, num_objs = 0, free_objects = 0; unsigned long active_slabs = 0, num_slabs = 0; - n = cachep->node[node]; - if (!n) - continue; - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->slabs_full, lru) { active_objs += cachep->num; @@ -1724,7 +1611,8 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) } /* - * Interface to system's page allocator. No need to hold the cache-lock. + * Interface to system's page allocator. No need to hold the + * kmem_cache_node ->list_lock. * * If we requested dmaable memory, we will get it. Even if we * did not request dmaable memory, we might get it, but that @@ -2026,9 +1914,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, * @cachep: cache pointer being destroyed * @page: page pointer being destroyed * - * Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. The - * cache-lock is not held/needed. + * Destroy all the objs in a slab page, and release the mem back to the system. + * Before calling the slab page must have been unlinked from the cache. The + * kmem_cache_node ->list_lock is not held/needed. */ static void slab_destroy(struct kmem_cache *cachep, struct page *page) { @@ -2060,6 +1948,16 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) kmem_cache_free(cachep->freelist_cache, freelist); } +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) +{ + struct page *page, *n; + + list_for_each_entry_safe(page, n, list, lru) { + list_del(&page->lru); + slab_destroy(cachep, page); + } +} + /** * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created @@ -2405,17 +2303,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) return err; } - if (flags & SLAB_DEBUG_OBJECTS) { - /* - * Would deadlock through slab_destroy()->call_rcu()-> - * debug_object_activate()->kmem_cache_alloc(). - */ - WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); - - slab_set_debugobj_lock_classes(cachep); - } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes(cachep); - return 0; } @@ -2434,7 +2321,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); + assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); #endif } @@ -2442,7 +2329,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->node[node]->list_lock); + assert_spin_locked(&get_node(cachep, node)->list_lock); #endif } @@ -2462,12 +2349,16 @@ static void do_drain(void *arg) struct kmem_cache *cachep = arg; struct array_cache *ac; int node = numa_mem_id(); + struct kmem_cache_node *n; + LIST_HEAD(list); check_irq_off(); ac = cpu_cache_get(cachep); - spin_lock(&cachep->node[node]->list_lock); - free_block(cachep, ac->entry, ac->avail, node); - spin_unlock(&cachep->node[node]->list_lock); + n = get_node(cachep, node); + spin_lock(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); + spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); ac->avail = 0; } @@ -2478,17 +2369,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) on_each_cpu(do_drain, cachep, 1); check_irq_on(); - for_each_online_node(node) { - n = cachep->node[node]; - if (n && n->alien) + for_each_kmem_cache_node(cachep, node, n) + if (n->alien) drain_alien_cache(cachep, n->alien); - } - for_each_online_node(node) { - n = cachep->node[node]; - if (n) - drain_array(cachep, n, n->shared, 1, node); - } + for_each_kmem_cache_node(cachep, node, n) + drain_array(cachep, n, n->shared, 1, node); } /* @@ -2534,17 +2420,14 @@ out: int __kmem_cache_shrink(struct kmem_cache *cachep) { - int ret = 0, i = 0; + int ret = 0; + int node; struct kmem_cache_node *n; drain_cpu_caches(cachep); check_irq_on(); - for_each_online_node(i) { - n = cachep->node[i]; - if (!n) - continue; - + for_each_kmem_cache_node(cachep, node, n) { drain_freelist(cachep, n, slabs_tofree(cachep, n)); ret += !list_empty(&n->slabs_full) || @@ -2566,13 +2449,11 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) kfree(cachep->array[i]); /* NUMA: free the node structures */ - for_each_online_node(i) { - n = cachep->node[i]; - if (n) { - kfree(n->shared); - free_alien_cache(n->alien); - kfree(n); - } + for_each_kmem_cache_node(cachep, i, n) { + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + cachep->node[i] = NULL; } return 0; } @@ -2751,7 +2632,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Take the node list lock to change the colour_next on this node */ check_irq_off(); - n = cachep->node[nodeid]; + n = get_node(cachep, nodeid); spin_lock(&n->list_lock); /* Get colour for the slab, and cal the next value. */ @@ -2920,7 +2801,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - n = cachep->node[node]; + n = get_node(cachep, node); BUG_ON(ac->avail > 0 || !n); spin_lock(&n->list_lock); @@ -3060,7 +2941,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) { - if (cachep == kmem_cache) + if (unlikely(cachep == kmem_cache)) return false; return should_failslab(cachep->object_size, flags, cachep->flags); @@ -3169,8 +3050,8 @@ retry: nid = zone_to_nid(zone); if (cpuset_zone_allowed_hardwall(zone, flags) && - cache->node[nid] && - cache->node[nid]->free_objects) { + get_node(cache, nid) && + get_node(cache, nid)->free_objects) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (obj) @@ -3233,7 +3114,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int x; VM_BUG_ON(nodeid > num_online_nodes()); - n = cachep->node[nodeid]; + n = get_node(cachep, nodeid); BUG_ON(!n); retry: @@ -3304,7 +3185,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (nodeid == NUMA_NO_NODE) nodeid = slab_node; - if (unlikely(!cachep->node[nodeid])) { + if (unlikely(!get_node(cachep, nodeid))) { /* Node not bootstrapped yet */ ptr = fallback_alloc(cachep, flags); goto out; @@ -3405,12 +3286,13 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) /* * Caller needs to acquire correct kmem_cache_node's list_lock + * @list: List of detached free slabs should be freed by caller */ -static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, - int node) +static void free_block(struct kmem_cache *cachep, void **objpp, + int nr_objects, int node, struct list_head *list) { int i; - struct kmem_cache_node *n; + struct kmem_cache_node *n = get_node(cachep, node); for (i = 0; i < nr_objects; i++) { void *objp; @@ -3420,7 +3302,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, objp = objpp[i]; page = virt_to_head_page(objp); - n = cachep->node[node]; list_del(&page->lru); check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp, node); @@ -3431,13 +3312,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, if (page->active == 0) { if (n->free_objects > n->free_limit) { n->free_objects -= cachep->num; - /* No need to drop any previously held - * lock here, even if we have a off-slab slab - * descriptor it is guaranteed to come from - * a different cache, refer to comments before - * alloc_slabmgmt. - */ - slab_destroy(cachep, page); + list_add_tail(&page->lru, list); } else { list_add(&page->lru, &n->slabs_free); } @@ -3456,13 +3331,14 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) int batchcount; struct kmem_cache_node *n; int node = numa_mem_id(); + LIST_HEAD(list); batchcount = ac->batchcount; #if DEBUG BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); - n = cachep->node[node]; + n = get_node(cachep, node); spin_lock(&n->list_lock); if (n->shared) { struct array_cache *shared_array = n->shared; @@ -3477,7 +3353,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) } } - free_block(cachep, ac->entry, batchcount, node); + free_block(cachep, ac->entry, batchcount, node, &list); free_done: #if STATS { @@ -3498,6 +3374,7 @@ free_done: } #endif spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); } @@ -3754,7 +3631,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) int node; struct kmem_cache_node *n; struct array_cache *new_shared; - struct array_cache **new_alien = NULL; + struct alien_cache **new_alien = NULL; for_each_online_node(node) { @@ -3775,15 +3652,16 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) } } - n = cachep->node[node]; + n = get_node(cachep, node); if (n) { struct array_cache *shared = n->shared; + LIST_HEAD(list); spin_lock_irq(&n->list_lock); if (shared) free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &list); n->shared = new_shared; if (!n->alien) { @@ -3793,6 +3671,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); kfree(shared); free_alien_cache(new_alien); continue; @@ -3820,9 +3699,8 @@ fail: /* Cache is not active yet. Roll back what we did */ node--; while (node >= 0) { - if (cachep->node[node]) { - n = cachep->node[node]; - + n = get_node(cachep, node); + if (n) { kfree(n->shared); free_alien_cache(n->alien); kfree(n); @@ -3883,12 +3761,20 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, cachep->shared = shared; for_each_online_cpu(i) { + LIST_HEAD(list); struct array_cache *ccold = new->new[i]; + int node; + struct kmem_cache_node *n; + if (!ccold) continue; - spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); + + node = cpu_to_mem(i); + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + free_block(cachep, ccold->entry, ccold->avail, node, &list); + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); kfree(ccold); } kfree(new); @@ -3996,6 +3882,7 @@ skip_setup: static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, struct array_cache *ac, int force, int node) { + LIST_HEAD(list); int tofree; if (!ac || !ac->avail) @@ -4008,12 +3895,13 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node); + free_block(cachep, ac->entry, tofree, node, &list); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); } } @@ -4048,7 +3936,7 @@ static void cache_reap(struct work_struct *w) * have established with reasonable certainty that * we can do some work if the lock was obtained. */ - n = searchp->node[node]; + n = get_node(searchp, node); reap_alien(searchp, n); @@ -4100,10 +3988,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) active_objs = 0; num_slabs = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); @@ -4328,10 +4213,7 @@ static int leaks_show(struct seq_file *m, void *p) x[1] = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); @@ -256,13 +256,12 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __FUNCTION__, cachep->name, s->name); + __func__, cachep->name, s->name); WARN_ON_ONCE(1); return s; } -#endif - +#ifndef CONFIG_SLOB /* * The slab lists for all objects. */ @@ -277,7 +276,7 @@ struct kmem_cache_node { unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ + struct alien_cache **alien; /* on other nodes */ unsigned long next_reap; /* updated without locking */ int free_touched; /* updated without locking */ #endif @@ -294,5 +293,22 @@ struct kmem_cache_node { }; +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ + if (__n) + +#endif + void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); + +#endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index d31c4ba..d319502 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -19,6 +19,8 @@ #include <asm/tlbflush.h> #include <asm/page.h> #include <linux/memcontrol.h> + +#define CREATE_TRACE_POINTS #include <trace/events/kmem.h> #include "slab.h" @@ -787,3 +789,102 @@ static int __init slab_proc_init(void) } module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ + +static __always_inline void *__do_krealloc(const void *p, size_t new_size, + gfp_t flags) +{ + void *ret; + size_t ks = 0; + + if (p) + ks = ksize(p); + + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret && p) + memcpy(ret, p, ks); + + return ret; +} + +/** + * __krealloc - like krealloc() but don't free @p. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * This function is like krealloc() except it never frees the originally + * allocated buffer. Use this if you don't want to free the buffer immediately + * like, for example, with RCU. + */ +void *__krealloc(const void *p, size_t new_size, gfp_t flags) +{ + if (unlikely(!new_size)) + return ZERO_SIZE_PTR; + + return __do_krealloc(p, new_size, flags); + +} +EXPORT_SYMBOL(__krealloc); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, flags); + if (ret && p != ret) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc); + +/** + * kzfree - like kfree but zero memory + * @p: object to free memory of + * + * The memory of the object @p points to is zeroed before freed. + * If @p is %NULL, kzfree() does nothing. + * + * Note: this function zeroes the whole allocated buffer which can be a good + * deal bigger than the requested buffer size passed to kmalloc(). So be + * careful when using this function in performance sensitive code. + */ +void kzfree(const void *p) +{ + size_t ks; + void *mem = (void *)p; + + if (unlikely(ZERO_OR_NULL_PTR(mem))) + return; + ks = ksize(mem); + memset(mem, 0, ks); + kfree(mem); +} +EXPORT_SYMBOL(kzfree); + +/* Tracepoints definitions. */ +EXPORT_TRACEPOINT_SYMBOL(kmalloc); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); +EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); +EXPORT_TRACEPOINT_SYMBOL(kfree); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); @@ -233,11 +233,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) -{ - return s->node[node]; -} - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) @@ -288,6 +283,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) +#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ + for (__p = (__addr), __idx = 1; __idx <= __objects;\ + __p += (__s)->size, __idx++) + /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { @@ -382,9 +381,9 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@ -418,9 +417,9 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@ -945,60 +944,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, } /* - * Hooks for other subsystems that check memory allocations. In a typical - * production configuration these hooks all should produce no code at all. - */ -static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - kmemleak_alloc(ptr, size, 1, flags); -} - -static inline void kfree_hook(const void *x) -{ - kmemleak_free(x); -} - -static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); - - return should_failslab(s->object_size, flags, s->flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, - gfp_t flags, void *object) -{ - flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); -} - -static inline void slab_free_hook(struct kmem_cache *s, void *x) -{ - kmemleak_free_recursive(x, s->flags); - - /* - * Trouble is that we may no longer disable interrupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. - */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) - { - unsigned long flags; - - local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } -#endif - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->object_size); -} - -/* * Tracking of fully allocated slabs for debugging purposes. */ static void add_full(struct kmem_cache *s, @@ -1282,6 +1227,12 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +#endif /* CONFIG_SLUB_DEBUG */ + +/* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); @@ -1293,21 +1244,44 @@ static inline void kfree_hook(const void *x) } static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { return 0; } +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); + + return should_failslab(s->object_size, flags, s->flags); +} -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) +static inline void slab_post_alloc_hook(struct kmem_cache *s, + gfp_t flags, void *object) { - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, - flags & gfp_allowed_mask); + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); } static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -} -#endif /* CONFIG_SLUB_DEBUG */ + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); + local_irq_restore(flags); + } +#endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); +} /* * Slab allocation and freeing @@ -1409,9 +1383,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; void *start; - void *last; void *p; int order; + int idx; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1432,14 +1406,13 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << order); - last = start; - for_each_object(p, s, start, page->objects) { - setup_object(s, page, last); - set_freepointer(s, last, p); - last = p; + for_each_object_idx(p, idx, s, start, page->objects) { + setup_object(s, page, p); + if (likely(idx < page->objects)) + set_freepointer(s, p, p + s->size); + else + set_freepointer(s, p, NULL); } - setup_object(s, page, last); - set_freepointer(s, last, NULL); page->freelist = start; page->inuse = page->objects; @@ -2162,6 +2135,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); int node; + struct kmem_cache_node *n; if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; @@ -2176,15 +2150,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", s->name); - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long nr_slabs; unsigned long nr_objs; unsigned long nr_free; - if (!n) - continue; - nr_free = count_partial(n, count_free); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); @@ -2928,13 +2898,10 @@ static void early_kmem_cache_node_alloc(int node) static void free_kmem_cache_nodes(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = s->node[node]; - - if (n) - kmem_cache_free(kmem_cache_node, n); - + for_each_kmem_cache_node(s, node, n) { + kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; } } @@ -3222,12 +3189,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) static inline int kmem_cache_close(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; flush_all(s); /* Attempt to free all objects */ - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { free_partial(s, n); if (n->nr_partial || slabs_node(s, node)) return 1; @@ -3412,9 +3378,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { if (!n->nr_partial) continue; @@ -3586,6 +3550,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n; memcpy(s, static_cache, kmem_cache->object_size); @@ -3595,19 +3560,16 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) * IPIs around. */ __flush_cpu_slab(s, smp_processor_id()); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { struct page *p; - if (n) { - list_for_each_entry(p, &n->partial, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->partial, lru) + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->full, lru) + p->slab_cache = s; #endif - } } list_add(&s->list, &slab_caches); return s; @@ -3960,16 +3922,14 @@ static long validate_slab_cache(struct kmem_cache *s) unsigned long count = 0; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - } kfree(map); return count; } @@ -4123,6 +4083,7 @@ static int list_locations(struct kmem_cache *s, char *buf, int node; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_TEMPORARY)) { @@ -4132,8 +4093,7 @@ static int list_locations(struct kmem_cache *s, char *buf, /* Push back cpu slabs */ flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long flags; struct page *page; @@ -4205,7 +4165,7 @@ static int list_locations(struct kmem_cache *s, char *buf, #endif #ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) +static void __init resiliency_test(void) { u8 *p; @@ -4332,8 +4292,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = atomic_long_read(&n->total_objects); @@ -4349,9 +4310,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } else #endif if (flags & SO_PARTIAL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = count_partial(n, count_total); else if (flags & SO_OBJECTS) @@ -4364,7 +4325,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA - for_each_node_state(node, N_NORMAL_MEMORY) + for (node = 0; node < nr_node_ids; node++) if (nodes[node]) x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); @@ -4378,16 +4339,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s, static int any_slab_objects(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) if (atomic_long_read(&n->total_objects)) return 1; - } + return 0; } #endif @@ -4509,7 +4466,7 @@ SLAB_ATTR_RO(ctor); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->refcount - 1); + return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1); } SLAB_ATTR_RO(aliases); @@ -5171,12 +5128,6 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = '-'; p += sprintf(p, "%07d", s->size); -#ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) - p += sprintf(p, "-%08d", - memcg_cache_id(s->memcg_params->memcg)); -#endif - BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } @@ -5342,13 +5293,9 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) unsigned long nr_objs = 0; unsigned long nr_free = 0; int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); @@ -501,7 +501,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, SetPageActive(page); lru += LRU_ACTIVE; add_page_to_lru_list(page, lruvec, lru); - trace_mm_lru_activate(page, page_to_pfn(page)); + trace_mm_lru_activate(page); __count_vm_event(PGACTIVATE); update_page_reclaim_stat(lruvec, file, 1); @@ -589,6 +589,9 @@ static void __lru_cache_activate_page(struct page *page) * inactive,unreferenced -> inactive,referenced * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced + * + * When a newly allocated page is not yet visible, so safe for non-atomic ops, + * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ void mark_page_accessed(struct page *page) { @@ -614,17 +617,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -/* - * Used to mark_page_accessed(page) that is not visible yet and when it is - * still safe to use non-atomic ops - */ -void init_page_accessed(struct page *page) -{ - if (!PageReferenced(page)) - __SetPageReferenced(page); -} -EXPORT_SYMBOL(init_page_accessed); - static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); @@ -996,7 +988,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, SetPageLRU(page); add_page_to_lru_list(page, lruvec, lru); update_page_reclaim_stat(lruvec, file, active); - trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); + trace_mm_lru_insertion(page, lru); } /* @@ -16,9 +16,6 @@ #include "internal.h" -#define CREATE_TRACE_POINTS -#include <trace/events/kmem.h> - /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -112,97 +109,6 @@ void *memdup_user(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user); -static __always_inline void *__do_krealloc(const void *p, size_t new_size, - gfp_t flags) -{ - void *ret; - size_t ks = 0; - - if (p) - ks = ksize(p); - - if (ks >= new_size) - return (void *)p; - - ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); - - return ret; -} - -/** - * __krealloc - like krealloc() but don't free @p. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * This function is like krealloc() except it never frees the originally - * allocated buffer. Use this if you don't want to free the buffer immediately - * like, for example, with RCU. - */ -void *__krealloc(const void *p, size_t new_size, gfp_t flags) -{ - if (unlikely(!new_size)) - return ZERO_SIZE_PTR; - - return __do_krealloc(p, new_size, flags); - -} -EXPORT_SYMBOL(__krealloc); - -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. - */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - - if (unlikely(!new_size)) { - kfree(p); - return ZERO_SIZE_PTR; - } - - ret = __do_krealloc(p, new_size, flags); - if (ret && p != ret) - kfree(p); - - return ret; -} -EXPORT_SYMBOL(krealloc); - -/** - * kzfree - like kfree but zero memory - * @p: object to free memory of - * - * The memory of the object @p points to is zeroed before freed. - * If @p is %NULL, kzfree() does nothing. - * - * Note: this function zeroes the whole allocated buffer which can be a good - * deal bigger than the requested buffer size passed to kmalloc(). So be - * careful when using this function in performance sensitive code. - */ -void kzfree(const void *p) -{ - size_t ks; - void *mem = (void *)p; - - if (unlikely(ZERO_OR_NULL_PTR(mem))) - return; - ks = ksize(mem); - memset(mem, 0, ks); - kfree(mem); -} -EXPORT_SYMBOL(kzfree); - /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate @@ -504,11 +410,3 @@ out_mm: out: return res; } - -/* Tracepoints definitions. */ -EXPORT_TRACEPOINT_SYMBOL(kmalloc); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); -EXPORT_TRACEPOINT_SYMBOL(kfree); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f64632b..2b0aa54 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1270,19 +1270,15 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) } EXPORT_SYMBOL_GPL(unmap_kernel_range); -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) { unsigned long addr = (unsigned long)area->addr; unsigned long end = addr + get_vm_area_size(area); int err; - err = vmap_page_range(addr, end, prot, *pages); - if (err > 0) { - *pages += err; - err = 0; - } + err = vmap_page_range(addr, end, prot, pages); - return err; + return err > 0 ? 0 : err; } EXPORT_SYMBOL_GPL(map_vm_area); @@ -1548,7 +1544,7 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, &pages)) { + if (map_vm_area(area, prot, pages)) { vunmap(area->addr); return NULL; } @@ -1566,7 +1562,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; - gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1589,12 +1586,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) { struct page *page; - gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; if (node == NUMA_NO_NODE) - page = alloc_page(tmp_mask); + page = alloc_page(alloc_mask); else - page = alloc_pages_node(node, tmp_mask, order); + page = alloc_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1602,9 +1598,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; + if (gfp_mask & __GFP_WAIT) + cond_resched(); } - if (map_vm_area(area, prot, &pages)) + if (map_vm_area(area, prot, pages)) goto fail; return area->addr; @@ -2690,14 +2688,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi) prev_end = VMALLOC_START; - spin_lock(&vmap_area_lock); + rcu_read_lock(); if (list_empty(&vmap_area_list)) { vmi->largest_chunk = VMALLOC_TOTAL; goto out; } - list_for_each_entry(va, &vmap_area_list, list) { + list_for_each_entry_rcu(va, &vmap_area_list, list) { unsigned long addr = va->va_start; /* @@ -2724,7 +2722,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi) vmi->largest_chunk = VMALLOC_END - prev_end; out: - spin_unlock(&vmap_area_lock); + rcu_read_unlock(); } #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 0f16ffe..d2f65c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -59,35 +59,20 @@ #include <trace/events/vmscan.h> struct scan_control { - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - unsigned long hibernation_mode; - /* This context's GFP mask */ gfp_t gfp_mask; - int may_writepage; - - /* Can mapped pages be reclaimed? */ - int may_unmap; - - /* Can pages be swapped as part of reclaim? */ - int may_swap; - + /* Allocation order */ int order; - /* Scan (total_size >> priority) pages at once */ - int priority; - - /* anon vs. file LRUs scanning "ratio" */ - int swappiness; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the @@ -95,11 +80,27 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; + /* Scan (total_size >> priority) pages at once */ + int priority; + + unsigned int may_writepage:1; + + /* Can mapped pages be reclaimed? */ + unsigned int may_unmap:1; + + /* Can pages be swapped as part of reclaim? */ + unsigned int may_swap:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -136,7 +137,11 @@ struct scan_control { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -unsigned long vm_total_pages; /* The total number of pages which the VM controls */ +/* + * The total number of pages which are beyond the high watermark within all + * zones. + */ +unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -169,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) bool zone_reclaimable(struct zone *zone) { - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + return zone_page_state(zone, NR_PAGES_SCANNED) < + zone_reclaimable_pages(zone) * 6; } static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -1503,7 +1509,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); if (global_reclaim(sc)) { - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else @@ -1693,7 +1699,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); reclaim_stat->recent_scanned[file] += nr_taken; @@ -1750,7 +1756,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * Count referenced pages from currently used mappings as rotated, * even though only some of them are actually re-activated. This * helps balance scan pressure between file and anonymous pages in - * get_scan_ratio. + * get_scan_count. */ reclaim_stat->recent_rotated[file] += nr_rotated; @@ -1865,8 +1871,8 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - unsigned long *nr) +static void get_scan_count(struct lruvec *lruvec, int swappiness, + struct scan_control *sc, unsigned long *nr) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -1909,7 +1915,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !sc->swappiness) { + if (!global_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -1919,16 +1925,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ - if (!sc->priority && sc->swappiness) { + if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); - /* * Prevent the reclaimer from falling into the cache trap: as * cache pages start out inactive, every cache fault will tip @@ -1939,9 +1940,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * anon pages. Try to detect this based on file LRU size. */ if (global_reclaim(sc)) { - unsigned long free = zone_page_state(zone, NR_FREE_PAGES); + unsigned long zonefile; + unsigned long zonefree; - if (unlikely(file + free <= high_wmark_pages(zone))) { + zonefree = zone_page_state(zone, NR_FREE_PAGES); + zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { scan_balance = SCAN_ANON; goto out; } @@ -1962,7 +1968,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = sc->swappiness; + anon_prio = swappiness; file_prio = 200 - anon_prio; /* @@ -1976,6 +1982,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * * anon in [0], file in [1] */ + + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; @@ -2052,7 +2064,8 @@ out: /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, int swappiness, + struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -2063,7 +2076,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, sc, nr); + get_scan_count(lruvec, swappiness, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2241,9 +2254,10 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +static bool shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned; + bool reclaimable = false; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2259,11 +2273,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, &reclaim); do { struct lruvec *lruvec; + int swappiness; lruvec = mem_cgroup_zone_lruvec(zone, memcg); + swappiness = mem_cgroup_swappiness(memcg); - sc->swappiness = mem_cgroup_swappiness(memcg); - shrink_lruvec(lruvec, sc); + shrink_lruvec(lruvec, swappiness, sc); /* * Direct reclaim and kswapd have to scan all memory @@ -2287,20 +2302,21 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return reclaimable; } /* Returns true if compaction should go ahead for a high-order request */ -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +static inline bool compaction_ready(struct zone *zone, int order) { unsigned long balance_gap, watermark; bool watermark_ok; - /* Do not consider compaction for orders reclaim is meant to satisfy */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) - return false; - /* * Compaction takes time to run and there are potentially other * callers using the pages just freed. Continue reclaiming until @@ -2309,18 +2325,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) */ balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); /* * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone, sc->order)) + if (compaction_deferred(zone, order)) return watermark_ok; /* If compaction is not ready to start, keep reclaiming */ - if (!compaction_suitable(zone, sc->order)) + if (!compaction_suitable(zone, order)) return false; return watermark_ok; @@ -2342,10 +2358,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. * - * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is ready to begin. This indicates to - * the caller that it should consider retrying the allocation instead of - * further reclaim. + * Returns true if a zone was reclaimable. */ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { @@ -2354,13 +2367,13 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long lru_pages = 0; - bool aborted_reclaim = false; struct reclaim_state *reclaim_state = current->reclaim_state; gfp_t orig_mask; struct shrink_control shrink = { .gfp_mask = sc->gfp_mask, }; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2391,22 +2404,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ - if (IS_ENABLED(CONFIG_COMPACTION)) { - /* - * If we already have plenty of memory free for - * compaction in this zone, don't free any more. - * Even though compaction is invoked for any - * non-zero order, only frequent costly order - * reclamation is disruptive enough to become a - * noticeable problem, like transparent huge - * page allocations. - */ - if ((zonelist_zone_idx(z) <= requested_highidx) - && compaction_ready(zone, sc)) { - aborted_reclaim = true; - continue; - } + + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER && + zonelist_zone_idx(z) <= requested_highidx && + compaction_ready(zone, sc->order)) { + sc->compaction_ready = true; + continue; } + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2419,10 +2434,17 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; + if (nr_soft_reclaimed) + reclaimable = true; /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc); + if (shrink_zone(zone, sc)) + reclaimable = true; + + if (global_reclaim(sc) && + !reclaimable && zone_reclaimable(zone)) + reclaimable = true; } /* @@ -2445,27 +2467,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) */ sc->gfp_mask = orig_mask; - return aborted_reclaim; -} - -/* All zones in zonelist are unreclaimable? */ -static bool all_unreclaimable(struct zonelist *zonelist, - struct scan_control *sc) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - if (!populated_zone(zone)) - continue; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - if (zone_reclaimable(zone)) - return false; - } - - return true; + return reclaimable; } /* @@ -2489,7 +2491,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, { unsigned long total_scanned = 0; unsigned long writeback_threshold; - bool aborted_reclaim; + bool zones_reclaimable; delayacct_freepages_start(); @@ -2500,11 +2502,14 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - aborted_reclaim = shrink_zones(zonelist, sc); + zones_reclaimable = shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) - goto out; + break; + + if (sc->compaction_ready) + break; /* * If we're getting trouble reclaiming, start doing @@ -2526,28 +2531,19 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - } while (--sc->priority >= 0 && !aborted_reclaim); + } while (--sc->priority >= 0); -out: delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; - /* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. Thus bypassing all_unreclaimable - * check. - */ - if (oom_killer_disabled) - return 0; - /* Aborted reclaim to try compaction? don't OOM, then */ - if (aborted_reclaim) + if (sc->compaction_ready) return 1; - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) + /* Any of the zones still reclaimable? Don't OOM. */ + if (zones_reclaimable) return 1; return 0; @@ -2684,15 +2680,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, { unsigned long nr_reclaimed; struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .order = order, + .nodemask = nodemask, + .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, - .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, - .order = order, - .priority = DEF_PRIORITY, - .target_mem_cgroup = NULL, - .nodemask = nodemask, }; /* @@ -2722,17 +2717,14 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, unsigned long *nr_scanned) { struct scan_control sc = { - .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, + .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .order = 0, - .priority = 0, - .swappiness = mem_cgroup_swappiness(memcg), - .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + int swappiness = mem_cgroup_swappiness(memcg); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2748,7 +2740,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_lruvec(lruvec, &sc); + shrink_lruvec(lruvec, swappiness, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2764,16 +2756,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_reclaimed; int nid; struct scan_control sc = { - .may_writepage = !laptop_mode, - .may_unmap = 1, - .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .order = 0, - .priority = DEF_PRIORITY, - .target_mem_cgroup = memcg, - .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .target_mem_cgroup = memcg, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, }; /* @@ -3031,12 +3021,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .order = order, .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, - .may_writepage = !laptop_mode, - .order = order, - .target_mem_cgroup = NULL, }; count_vm_event(PAGEOUTRUN); @@ -3417,14 +3406,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct reclaim_state reclaim_state; struct scan_control sc = { + .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, - .may_swap = 1, - .may_unmap = 1, + .priority = DEF_PRIORITY, .may_writepage = 1, - .nr_to_reclaim = nr_to_reclaim, + .may_unmap = 1, + .may_swap = 1, .hibernation_mode = 1, - .order = 0, - .priority = DEF_PRIORITY, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; @@ -3604,13 +3592,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; struct scan_control sc = { - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .may_swap = 1, .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, .priority = ZONE_RECLAIM_PRIORITY, + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_swap = 1, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, diff --git a/mm/vmstat.c b/mm/vmstat.c index b37bd49..e9ab104 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_possible_cpu(cpu) + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; } @@ -763,6 +763,7 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", + "nr_pages_scanned", #ifdef CONFIG_NUMA "numa_hit", @@ -1067,7 +1068,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - zone->pages_scanned, + zone_page_state(zone, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); @@ -1077,10 +1078,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone_page_state(zone, i)); seq_printf(m, - "\n protection: (%lu", + "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_printf(m, ")" "\n pagesets"); @@ -51,6 +51,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zbud.h> +#include <linux/zpool.h> /***************** * Structures @@ -113,6 +114,90 @@ struct zbud_header { }; /***************** + * zpool + ****************/ + +#ifdef CONFIG_ZPOOL + +static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) +{ + return zpool_evict(pool, handle); +} + +static struct zbud_ops zbud_zpool_ops = { + .evict = zbud_zpool_evict +}; + +static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zbud_create_pool(gfp, &zbud_zpool_ops); +} + +static void zbud_zpool_destroy(void *pool) +{ + zbud_destroy_pool(pool); +} + +static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zbud_alloc(pool, size, gfp, handle); +} +static void zbud_zpool_free(void *pool, unsigned long handle) +{ + zbud_free(pool, handle); +} + +static int zbud_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zbud_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *zbud_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return zbud_map(pool, handle); +} +static void zbud_zpool_unmap(void *pool, unsigned long handle) +{ + zbud_unmap(pool, handle); +} + +static u64 zbud_zpool_total_size(void *pool) +{ + return zbud_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, + .malloc = zbud_zpool_malloc, + .free = zbud_zpool_free, + .shrink = zbud_zpool_shrink, + .map = zbud_zpool_map, + .unmap = zbud_zpool_unmap, + .total_size = zbud_zpool_total_size, +}; + +#endif /* CONFIG_ZPOOL */ + +/***************** * Helpers *****************/ /* Just to make the code easier to read */ @@ -122,7 +207,7 @@ enum buddy { }; /* Converts an allocation size in bytes to size in zbud chunks */ -static int size_to_chunks(int size) +static int size_to_chunks(size_t size) { return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; } @@ -247,7 +332,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ -int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, unsigned long *handle) { int chunks, i, freechunks; @@ -511,11 +596,20 @@ static int __init init_zbud(void) /* Make sure the zbud header will fit in one chunk */ BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); pr_info("loaded\n"); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zbud_zpool_driver); +#endif + return 0; } static void __exit exit_zbud(void) { +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zbud_zpool_driver); +#endif + pr_info("unloaded\n"); } diff --git a/mm/zpool.c b/mm/zpool.c new file mode 100644 index 0000000..e40612a --- /dev/null +++ b/mm/zpool.c @@ -0,0 +1,364 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for memory storage pool implementations. + * Typically, this is used to store compressed memory. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/list.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/zpool.h> + +struct zpool { + char *type; + + struct zpool_driver *driver; + void *pool; + struct zpool_ops *ops; + + struct list_head list; +}; + +static LIST_HEAD(drivers_head); +static DEFINE_SPINLOCK(drivers_lock); + +static LIST_HEAD(pools_head); +static DEFINE_SPINLOCK(pools_lock); + +/** + * zpool_register_driver() - register a zpool implementation. + * @driver: driver to register + */ +void zpool_register_driver(struct zpool_driver *driver) +{ + spin_lock(&drivers_lock); + atomic_set(&driver->refcount, 0); + list_add(&driver->list, &drivers_head); + spin_unlock(&drivers_lock); +} +EXPORT_SYMBOL(zpool_register_driver); + +/** + * zpool_unregister_driver() - unregister a zpool implementation. + * @driver: driver to unregister. + * + * Module usage counting is used to prevent using a driver + * while/after unloading, so if this is called from module + * exit function, this should never fail; if called from + * other than the module exit function, and this returns + * failure, the driver is in use and must remain available. + */ +int zpool_unregister_driver(struct zpool_driver *driver) +{ + int ret = 0, refcount; + + spin_lock(&drivers_lock); + refcount = atomic_read(&driver->refcount); + WARN_ON(refcount < 0); + if (refcount > 0) + ret = -EBUSY; + else + list_del(&driver->list); + spin_unlock(&drivers_lock); + + return ret; +} +EXPORT_SYMBOL(zpool_unregister_driver); + +/** + * zpool_evict() - evict callback from a zpool implementation. + * @pool: pool to evict from. + * @handle: handle to evict. + * + * This can be used by zpool implementations to call the + * user's evict zpool_ops struct evict callback. + */ +int zpool_evict(void *pool, unsigned long handle) +{ + struct zpool *zpool; + + spin_lock(&pools_lock); + list_for_each_entry(zpool, &pools_head, list) { + if (zpool->pool == pool) { + spin_unlock(&pools_lock); + if (!zpool->ops || !zpool->ops->evict) + return -EINVAL; + return zpool->ops->evict(zpool, handle); + } + } + spin_unlock(&pools_lock); + + return -ENOENT; +} +EXPORT_SYMBOL(zpool_evict); + +static struct zpool_driver *zpool_get_driver(char *type) +{ + struct zpool_driver *driver; + + spin_lock(&drivers_lock); + list_for_each_entry(driver, &drivers_head, list) { + if (!strcmp(driver->type, type)) { + bool got = try_module_get(driver->owner); + + if (got) + atomic_inc(&driver->refcount); + spin_unlock(&drivers_lock); + return got ? driver : NULL; + } + } + + spin_unlock(&drivers_lock); + return NULL; +} + +static void zpool_put_driver(struct zpool_driver *driver) +{ + atomic_dec(&driver->refcount); + module_put(driver->owner); +} + +/** + * zpool_create_pool() - Create a new zpool + * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @gfp The GFP flags to use when allocating the pool. + * @ops The optional ops callback. + * + * This creates a new zpool of the specified type. The gfp flags will be + * used when allocating memory, if the implementation supports it. If the + * ops param is NULL, then the created zpool will not be shrinkable. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: New zpool on success, NULL on failure. + */ +struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) +{ + struct zpool_driver *driver; + struct zpool *zpool; + + pr_info("creating pool type %s\n", type); + + driver = zpool_get_driver(type); + + if (!driver) { + request_module(type); + driver = zpool_get_driver(type); + } + + if (!driver) { + pr_err("no driver for type %s\n", type); + return NULL; + } + + zpool = kmalloc(sizeof(*zpool), gfp); + if (!zpool) { + pr_err("couldn't create zpool - out of memory\n"); + zpool_put_driver(driver); + return NULL; + } + + zpool->type = driver->type; + zpool->driver = driver; + zpool->pool = driver->create(gfp, ops); + zpool->ops = ops; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); + zpool_put_driver(driver); + kfree(zpool); + return NULL; + } + + pr_info("created %s pool\n", type); + + spin_lock(&pools_lock); + list_add(&zpool->list, &pools_head); + spin_unlock(&pools_lock); + + return zpool; +} + +/** + * zpool_destroy_pool() - Destroy a zpool + * @pool The zpool to destroy. + * + * Implementations must guarantee this to be thread-safe, + * however only when destroying different pools. The same + * pool should only be destroyed once, and should not be used + * after it is destroyed. + * + * This destroys an existing zpool. The zpool should not be in use. + */ +void zpool_destroy_pool(struct zpool *zpool) +{ + pr_info("destroying pool type %s\n", zpool->type); + + spin_lock(&pools_lock); + list_del(&zpool->list); + spin_unlock(&pools_lock); + zpool->driver->destroy(zpool->pool); + zpool_put_driver(zpool->driver); + kfree(zpool); +} + +/** + * zpool_get_type() - Get the type of the zpool + * @pool The zpool to check + * + * This returns the type of the pool. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: The type of zpool. + */ +char *zpool_get_type(struct zpool *zpool) +{ + return zpool->type; +} + +/** + * zpool_malloc() - Allocate memory + * @pool The zpool to allocate from. + * @size The amount of memory to allocate. + * @gfp The GFP flags to use when allocating memory. + * @handle Pointer to the handle to set + * + * This allocates the requested amount of memory from the pool. + * The gfp flags will be used when allocating memory, if the + * implementation supports it. The provided @handle will be + * set to the allocated object handle. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error. + */ +int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zpool->driver->malloc(zpool->pool, size, gfp, handle); +} + +/** + * zpool_free() - Free previously allocated memory + * @pool The zpool that allocated the memory. + * @handle The handle to the memory to free. + * + * This frees previously allocated memory. This does not guarantee + * that the pool will actually free memory, only that the memory + * in the pool will become available for use by the pool. + * + * Implementations must guarantee this to be thread-safe, + * however only when freeing different handles. The same + * handle should only be freed once, and should not be used + * after freeing. + */ +void zpool_free(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->free(zpool->pool, handle); +} + +/** + * zpool_shrink() - Shrink the pool size + * @pool The zpool to shrink. + * @pages The number of pages to shrink the pool. + * @reclaimed The number of pages successfully evicted. + * + * This attempts to shrink the actual memory size of the pool + * by evicting currently used handle(s). If the pool was + * created with no zpool_ops, or the evict call fails for any + * of the handles, this will fail. If non-NULL, the @reclaimed + * parameter will be set to the number of pages reclaimed, + * which may be more than the number of pages requested. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error/failure. + */ +int zpool_shrink(struct zpool *zpool, unsigned int pages, + unsigned int *reclaimed) +{ + return zpool->driver->shrink(zpool->pool, pages, reclaimed); +} + +/** + * zpool_map_handle() - Map a previously allocated handle into memory + * @pool The zpool that the handle was allocated from + * @handle The handle to map + * @mm How the memory should be mapped + * + * This maps a previously allocated handle into memory. The @mm + * param indicates to the implementation how the memory will be + * used, i.e. read-only, write-only, read-write. If the + * implementation does not support it, the memory will be treated + * as read-write. + * + * This may hold locks, disable interrupts, and/or preemption, + * and the zpool_unmap_handle() must be called to undo those + * actions. The code that uses the mapped handle should complete + * its operatons on the mapped handle memory quickly and unmap + * as soon as possible. As the implementation may use per-cpu + * data, multiple handles should not be mapped concurrently on + * any cpu. + * + * Returns: A pointer to the handle's mapped memory area. + */ +void *zpool_map_handle(struct zpool *zpool, unsigned long handle, + enum zpool_mapmode mapmode) +{ + return zpool->driver->map(zpool->pool, handle, mapmode); +} + +/** + * zpool_unmap_handle() - Unmap a previously mapped handle + * @pool The zpool that the handle was allocated from + * @handle The handle to unmap + * + * This unmaps a previously mapped handle. Any locks or other + * actions that the implementation took in zpool_map_handle() + * will be undone here. The memory area returned from + * zpool_map_handle() should no longer be used after this. + */ +void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->unmap(zpool->pool, handle); +} + +/** + * zpool_get_total_size() - The total size of the pool + * @pool The zpool to check + * + * This returns the total size in bytes of the pool. + * + * Returns: Total size of the zpool in bytes. + */ +u64 zpool_get_total_size(struct zpool *zpool) +{ + return zpool->driver->total_size(zpool->pool); +} + +static int __init init_zpool(void) +{ + pr_info("loaded\n"); + return 0; +} + +static void __exit exit_zpool(void) +{ + pr_info("unloaded\n"); +} + +module_init(init_zpool); +module_exit(exit_zpool); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); +MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index fe78189..4e2fc83 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -92,6 +92,7 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/zsmalloc.h> +#include <linux/zpool.h> /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -240,6 +241,81 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; +/* zpool driver */ + +#ifdef CONFIG_ZPOOL + +static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zs_create_pool(gfp); +} + +static void zs_zpool_destroy(void *pool) +{ + zs_destroy_pool(pool); +} + +static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + *handle = zs_malloc(pool, size); + return *handle ? 0 : -1; +} +static void zs_zpool_free(void *pool, unsigned long handle) +{ + zs_free(pool, handle); +} + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + return -EINVAL; +} + +static void *zs_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + enum zs_mapmode zs_mm; + + switch (mm) { + case ZPOOL_MM_RO: + zs_mm = ZS_MM_RO; + break; + case ZPOOL_MM_WO: + zs_mm = ZS_MM_WO; + break; + case ZPOOL_MM_RW: /* fallthru */ + default: + zs_mm = ZS_MM_RW; + break; + } + + return zs_map_object(pool, handle, zs_mm); +} +static void zs_zpool_unmap(void *pool, unsigned long handle) +{ + zs_unmap_object(pool, handle); +} + +static u64 zs_zpool_total_size(void *pool) +{ + return zs_get_total_size_bytes(pool); +} + +static struct zpool_driver zs_zpool_driver = { + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .shrink = zs_zpool_shrink, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, +}; + +#endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -690,7 +766,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); + BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); area->vm_addr = area->vm->addr; return area->vm_addr + off; } @@ -814,6 +890,10 @@ static void zs_exit(void) { int cpu; +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + cpu_notifier_register_begin(); for_each_online_cpu(cpu) @@ -840,6 +920,10 @@ static int zs_init(void) cpu_notifier_register_done(); +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + return 0; fail: zs_exit(); @@ -34,7 +34,7 @@ #include <linux/swap.h> #include <linux/crypto.h> #include <linux/mempool.h> -#include <linux/zbud.h> +#include <linux/zpool.h> #include <linux/mm_types.h> #include <linux/page-flags.h> @@ -45,8 +45,8 @@ /********************************* * statistics **********************************/ -/* Number of memory pages used by the compressed pool */ -static u64 zswap_pool_pages; +/* Total bytes used by the compressed storage */ +static u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ static atomic_t zswap_stored_pages = ATOMIC_INIT(0); @@ -89,8 +89,13 @@ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); -/* zbud_pool is shared by all of zswap backend */ -static struct zbud_pool *zswap_pool; +/* Compressed storage to use */ +#define ZSWAP_ZPOOL_DEFAULT "zbud" +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; +module_param_named(zpool, zswap_zpool_type, charp, 0444); + +/* zpool is shared by all of zswap backend */ +static struct zpool *zswap_pool; /********************************* * compression functions @@ -168,7 +173,7 @@ static void zswap_comp_exit(void) * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * offset - the swap offset for the entry. Index into the red-black tree. - * handle - zbud allocation handle that stores the compressed page data + * handle - zpool allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. Needed during * decompression */ @@ -284,15 +289,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) } /* - * Carries out the common pattern of freeing and entry's zbud allocation, + * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ static void zswap_free_entry(struct zswap_entry *entry) { - zbud_free(zswap_pool, entry->handle); + zpool_free(zswap_pool, entry->handle); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(zswap_pool); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); } /* caller must hold the tree lock */ @@ -409,7 +414,7 @@ cleanup: static bool zswap_is_full(void) { return totalram_pages * zswap_max_pool_percent / 100 < - zswap_pool_pages; + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } /********************************* @@ -525,7 +530,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * the swap cache, the compressed version stored by zswap can be * freed. */ -static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) +static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) { struct zswap_header *zhdr; swp_entry_t swpentry; @@ -541,9 +546,9 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) }; /* extract swpentry from data */ - zhdr = zbud_map(pool, handle); + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ - zbud_unmap(pool, handle); + zpool_unmap_handle(pool, handle); tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); @@ -573,13 +578,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(zswap_pool, entry->handle) + - sizeof(struct zswap_header); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(zswap_pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -652,7 +657,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; - if (zbud_reclaim_page(zswap_pool, 8)) { + if (zpool_shrink(zswap_pool, 1, NULL)) { zswap_reject_reclaim_fail++; ret = -ENOMEM; goto reject; @@ -679,7 +684,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); - ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, + ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; @@ -689,11 +694,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto freepage; } - zhdr = zbud_map(zswap_pool, handle); + zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); zhdr->swpentry = swp_entry(type, offset); buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); - zbud_unmap(zswap_pool, handle); + zpool_unmap_handle(zswap_pool, handle); put_cpu_var(zswap_dstmem); /* populate entry */ @@ -716,7 +721,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(zswap_pool); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); return 0; @@ -752,13 +757,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(zswap_pool, entry->handle) + - sizeof(struct zswap_header); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(zswap_pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); spin_lock(&tree->lock); @@ -811,7 +816,7 @@ static void zswap_frontswap_invalidate_area(unsigned type) zswap_trees[type] = NULL; } -static struct zbud_ops zswap_zbud_ops = { +static struct zpool_ops zswap_zpool_ops = { .evict = zswap_writeback_entry }; @@ -869,8 +874,8 @@ static int __init zswap_debugfs_init(void) zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_u64("duplicate_entry", S_IRUGO, zswap_debugfs_root, &zswap_duplicate_entry); - debugfs_create_u64("pool_pages", S_IRUGO, - zswap_debugfs_root, &zswap_pool_pages); + debugfs_create_u64("pool_total_size", S_IRUGO, + zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", S_IRUGO, zswap_debugfs_root, &zswap_stored_pages); @@ -895,16 +900,26 @@ static void __exit zswap_debugfs_exit(void) { } **********************************/ static int __init init_zswap(void) { + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + if (!zswap_enabled) return 0; pr_info("loading zswap\n"); - zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); + zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); + if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_info("%s zpool not available\n", zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, + &zswap_zpool_ops); + } if (!zswap_pool) { - pr_err("zbud pool creation failed\n"); + pr_err("%s zpool not available\n", zswap_zpool_type); + pr_err("zpool creation failed\n"); goto error; } + pr_info("using %s pool\n", zswap_zpool_type); if (zswap_entry_cache_create()) { pr_err("entry cache creation failed\n"); @@ -928,7 +943,7 @@ pcpufail: compfail: zswap_entry_cache_destory(); cachefail: - zbud_destroy_pool(zswap_pool); + zpool_destroy_pool(zswap_pool); error: return -ENOMEM; } diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 022d18a..52c43f9 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -188,7 +188,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, /* Reached the end of the list, so insert after 'frag_entry_last'. */ if (likely(frag_entry_last)) { - hlist_add_after(&frag_entry_last->list, &frag_entry_new->list); + hlist_add_behind(&frag_entry_last->list, &frag_entry_new->list); chain->size += skb->len - hdr_size; chain->timestamp = jiffies; ret = true; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b4845f4..7751c92 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1174,7 +1174,7 @@ static void br_multicast_add_router(struct net_bridge *br, } if (slot) - hlist_add_after_rcu(slot, &port->rlist); + hlist_add_behind_rcu(&port->rlist, slot); else hlist_add_head_rcu(&port->rlist, &br->router_list); } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5afeb5a..e9cb258 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -940,7 +940,7 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) last = li; } if (last) - hlist_add_after_rcu(&last->hlist, &new->hlist); + hlist_add_behind_rcu(&new->hlist, &last->hlist); else hlist_add_before_rcu(&new->hlist, &li->hlist); } diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 731e1e1..fd0dc47 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -277,7 +277,7 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) last = p; } if (last) - hlist_add_after_rcu(&last->list, &newp->list); + hlist_add_behind_rcu(&newp->list, &last->list); else hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); out: diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0525d78..beeed60 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -389,7 +389,7 @@ redo: if (h != h0) continue; hlist_del(&pol->bydst); - hlist_add_after(entry0, &pol->bydst); + hlist_add_behind(&pol->bydst, entry0); } entry0 = &pol->bydst; } @@ -654,7 +654,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) break; } if (newpos) - hlist_add_after(newpos, &policy->bydst); + hlist_add_behind(&policy->bydst, newpos); else hlist_add_head(&policy->bydst, chain); xfrm_pol_hold(policy); diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 182be0f..31a731e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -309,9 +309,12 @@ our $Operators = qr{ our $c90_Keywords = qr{do|for|while|if|else|return|goto|continue|switch|default|case|break}x; our $NonptrType; +our $NonptrTypeMisordered; our $NonptrTypeWithAttr; our $Type; +our $TypeMisordered; our $Declare; +our $DeclareMisordered; our $NON_ASCII_UTF8 = qr{ [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte @@ -353,16 +356,36 @@ our $signature_tags = qr{(?xi: Cc: )}; +our @typeListMisordered = ( + qr{char\s+(?:un)?signed}, + qr{int\s+(?:(?:un)?signed\s+)?short\s}, + qr{int\s+short(?:\s+(?:un)?signed)}, + qr{short\s+int(?:\s+(?:un)?signed)}, + qr{(?:un)?signed\s+int\s+short}, + qr{short\s+(?:un)?signed}, + qr{long\s+int\s+(?:un)?signed}, + qr{int\s+long\s+(?:un)?signed}, + qr{long\s+(?:un)?signed\s+int}, + qr{int\s+(?:un)?signed\s+long}, + qr{int\s+(?:un)?signed}, + qr{int\s+long\s+long\s+(?:un)?signed}, + qr{long\s+long\s+int\s+(?:un)?signed}, + qr{long\s+long\s+(?:un)?signed\s+int}, + qr{long\s+long\s+(?:un)?signed}, + qr{long\s+(?:un)?signed}, +); + our @typeList = ( qr{void}, - qr{(?:unsigned\s+)?char}, - qr{(?:unsigned\s+)?short}, - qr{(?:unsigned\s+)?int}, - qr{(?:unsigned\s+)?long}, - qr{(?:unsigned\s+)?long\s+int}, - qr{(?:unsigned\s+)?long\s+long}, - qr{(?:unsigned\s+)?long\s+long\s+int}, - qr{unsigned}, + qr{(?:(?:un)?signed\s+)?char}, + qr{(?:(?:un)?signed\s+)?short\s+int}, + qr{(?:(?:un)?signed\s+)?short}, + qr{(?:(?:un)?signed\s+)?int}, + qr{(?:(?:un)?signed\s+)?long\s+int}, + qr{(?:(?:un)?signed\s+)?long\s+long\s+int}, + qr{(?:(?:un)?signed\s+)?long\s+long}, + qr{(?:(?:un)?signed\s+)?long}, + qr{(?:un)?signed}, qr{float}, qr{double}, qr{bool}, @@ -372,6 +395,7 @@ our @typeList = ( qr{${Ident}_t}, qr{${Ident}_handler}, qr{${Ident}_handler_fn}, + @typeListMisordered, ); our @typeListWithAttr = ( @typeList, @@ -399,11 +423,6 @@ foreach my $entry (@mode_permission_funcs) { $mode_perms_search .= $entry->[0]; } -our $declaration_macros = qr{(?x: - (?:$Storage\s+)?(?:DECLARE|DEFINE)_[A-Z]+\s*\(| - (?:$Storage\s+)?LIST_HEAD\s*\( -)}; - our $allowed_asm_includes = qr{(?x: irq| memory @@ -413,6 +432,7 @@ our $allowed_asm_includes = qr{(?x: sub build_types { my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)"; my $all = "(?x: \n" . join("|\n ", @typeList) . "\n)"; + my $Misordered = "(?x: \n" . join("|\n ", @typeListMisordered) . "\n)"; my $allWithAttr = "(?x: \n" . join("|\n ", @typeListWithAttr) . "\n)"; $Modifier = qr{(?:$Attribute|$Sparse|$mods)}; $NonptrType = qr{ @@ -424,6 +444,13 @@ sub build_types { ) (?:\s+$Modifier|\s+const)* }x; + $NonptrTypeMisordered = qr{ + (?:$Modifier\s+|const\s+)* + (?: + (?:${Misordered}\b) + ) + (?:\s+$Modifier|\s+const)* + }x; $NonptrTypeWithAttr = qr{ (?:$Modifier\s+|const\s+)* (?: @@ -435,10 +462,16 @@ sub build_types { }x; $Type = qr{ $NonptrType - (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*|\[\])+|(?:\s*\[\s*\])+)? + (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)? + (?:\s+$Inline|\s+$Modifier)* + }x; + $TypeMisordered = qr{ + $NonptrTypeMisordered + (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)? (?:\s+$Inline|\s+$Modifier)* }x; $Declare = qr{(?:$Storage\s+(?:$Inline\s+)?)?$Type}; + $DeclareMisordered = qr{(?:$Storage\s+(?:$Inline\s+)?)?$TypeMisordered}; } build_types(); @@ -452,6 +485,12 @@ our $balanced_parens = qr/(\((?:[^\(\)]++|(?-1))*\))/; our $LvalOrFunc = qr{((?:[\&\*]\s*)?$Lval)\s*($balanced_parens{0,1})\s*}; our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant)}; +our $declaration_macros = qr{(?x: + (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,2}\s*\(| + (?:$Storage\s+)?LIST_HEAD\s*\(| + (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\( +)}; + sub deparenthesize { my ($string) = @_; return "" if (!defined($string)); @@ -550,11 +589,43 @@ sub seed_camelcase_includes { } } +sub git_commit_info { + my ($commit, $id, $desc) = @_; + + return ($id, $desc) if ((which("git") eq "") || !(-e ".git")); + + my $output = `git log --no-color --format='%H %s' -1 $commit 2>&1`; + $output =~ s/^\s*//gm; + my @lines = split("\n", $output); + + if ($lines[0] =~ /^error: short SHA1 $commit is ambiguous\./) { +# Maybe one day convert this block of bash into something that returns +# all matching commit ids, but it's very slow... +# +# echo "checking commits $1..." +# git rev-list --remotes | grep -i "^$1" | +# while read line ; do +# git log --format='%H %s' -1 $line | +# echo "commit $(cut -c 1-12,41-)" +# done + } elsif ($lines[0] =~ /^fatal: ambiguous argument '$commit': unknown revision or path not in the working tree\./) { + } else { + $id = substr($lines[0], 0, 12); + $desc = substr($lines[0], 41); + } + + return ($id, $desc); +} + $chk_signoff = 0 if ($file); my @rawlines = (); my @lines = (); my @fixed = (); +my @fixed_inserted = (); +my @fixed_deleted = (); +my $fixlinenr = -1; + my $vname; for my $filename (@ARGV) { my $FILE; @@ -583,6 +654,9 @@ for my $filename (@ARGV) { @rawlines = (); @lines = (); @fixed = (); + @fixed_inserted = (); + @fixed_deleted = (); + $fixlinenr = -1; } exit($exit); @@ -674,6 +748,18 @@ sub format_email { return $formatted_email; } +sub which { + my ($bin) = @_; + + foreach my $path (split(/:/, $ENV{PATH})) { + if (-e "$path/$bin") { + return "$path/$bin"; + } + } + + return ""; +} + sub which_conf { my ($conf) = @_; @@ -1483,6 +1569,90 @@ sub report_dump { our @report; } +sub fixup_current_range { + my ($lineRef, $offset, $length) = @_; + + if ($$lineRef =~ /^\@\@ -\d+,\d+ \+(\d+),(\d+) \@\@/) { + my $o = $1; + my $l = $2; + my $no = $o + $offset; + my $nl = $l + $length; + $$lineRef =~ s/\+$o,$l \@\@/\+$no,$nl \@\@/; + } +} + +sub fix_inserted_deleted_lines { + my ($linesRef, $insertedRef, $deletedRef) = @_; + + my $range_last_linenr = 0; + my $delta_offset = 0; + + my $old_linenr = 0; + my $new_linenr = 0; + + my $next_insert = 0; + my $next_delete = 0; + + my @lines = (); + + my $inserted = @{$insertedRef}[$next_insert++]; + my $deleted = @{$deletedRef}[$next_delete++]; + + foreach my $old_line (@{$linesRef}) { + my $save_line = 1; + my $line = $old_line; #don't modify the array + if ($line =~ /^(?:\+\+\+\|\-\-\-)\s+\S+/) { #new filename + $delta_offset = 0; + } elsif ($line =~ /^\@\@ -\d+,\d+ \+\d+,\d+ \@\@/) { #new hunk + $range_last_linenr = $new_linenr; + fixup_current_range(\$line, $delta_offset, 0); + } + + while (defined($deleted) && ${$deleted}{'LINENR'} == $old_linenr) { + $deleted = @{$deletedRef}[$next_delete++]; + $save_line = 0; + fixup_current_range(\$lines[$range_last_linenr], $delta_offset--, -1); + } + + while (defined($inserted) && ${$inserted}{'LINENR'} == $old_linenr) { + push(@lines, ${$inserted}{'LINE'}); + $inserted = @{$insertedRef}[$next_insert++]; + $new_linenr++; + fixup_current_range(\$lines[$range_last_linenr], $delta_offset++, 1); + } + + if ($save_line) { + push(@lines, $line); + $new_linenr++; + } + + $old_linenr++; + } + + return @lines; +} + +sub fix_insert_line { + my ($linenr, $line) = @_; + + my $inserted = { + LINENR => $linenr, + LINE => $line, + }; + push(@fixed_inserted, $inserted); +} + +sub fix_delete_line { + my ($linenr, $line) = @_; + + my $deleted = { + LINENR => $linenr, + LINE => $line, + }; + + push(@fixed_deleted, $deleted); +} + sub ERROR { my ($type, $msg) = @_; @@ -1637,11 +1807,13 @@ sub process { my $signoff = 0; my $is_patch = 0; - my $in_header_lines = 1; + my $in_header_lines = $file ? 0 : 1; my $in_commit_log = 0; #Scanning lines before patch - + my $reported_maintainer_file = 0; my $non_utf8_charset = 0; + my $last_blank_line = 0; + our @report = (); our $cnt_lines = 0; our $cnt_error = 0; @@ -1759,8 +1931,10 @@ sub process { $realcnt = 0; $linenr = 0; + $fixlinenr = -1; foreach my $line (@lines) { $linenr++; + $fixlinenr++; my $sline = $line; #copy of $line $sline =~ s/$;/ /g; #with comments as spaces @@ -1891,7 +2065,7 @@ sub process { if (WARN("BAD_SIGN_OFF", "Do not use whitespace before $ucfirst_sign_off\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] = + $fixed[$fixlinenr] = "$ucfirst_sign_off $email"; } } @@ -1899,7 +2073,7 @@ sub process { if (WARN("BAD_SIGN_OFF", "'$ucfirst_sign_off' is the preferred signature form\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] = + $fixed[$fixlinenr] = "$ucfirst_sign_off $email"; } @@ -1908,7 +2082,7 @@ sub process { if (WARN("BAD_SIGN_OFF", "Use a single space after $ucfirst_sign_off\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] = + $fixed[$fixlinenr] = "$ucfirst_sign_off $email"; } } @@ -1956,6 +2130,31 @@ sub process { "Remove Gerrit Change-Id's before submitting upstream.\n" . $herecurr); } +# Check for improperly formed commit descriptions + if ($in_commit_log && + $line =~ /\bcommit\s+[0-9a-f]{5,}/i && + $line !~ /\b[Cc]ommit [0-9a-f]{12,16} \("/) { + $line =~ /\b(c)ommit\s+([0-9a-f]{5,})/i; + my $init_char = $1; + my $orig_commit = lc($2); + my $id = '01234567890ab'; + my $desc = 'commit description'; + ($id, $desc) = git_commit_info($orig_commit, $id, $desc); + ERROR("GIT_COMMIT_ID", + "Please use 12 to 16 chars for the git commit ID like: '${init_char}ommit $id (\"$desc\")'\n" . $herecurr); + } + +# Check for added, moved or deleted files + if (!$reported_maintainer_file && !$in_commit_log && + ($line =~ /^(?:new|deleted) file mode\s*\d+\s*$/ || + $line =~ /^rename (?:from|to) [\w\/\.\-]+\s*$/ || + ($line =~ /\{\s*([\w\/\.\-]*)\s*\=\>\s*([\w\/\.\-]*)\s*\}/ && + (defined($1) || defined($2))))) { + $reported_maintainer_file = 1; + WARN("FILE_PATH_CHANGES", + "added, moved or deleted file(s), does MAINTAINERS need updating?\n" . $herecurr); + } + # Check for wrappage within a valid hunk of the file if ($realcnt != 0 && $line !~ m{^(?:\+|-| |\\ No newline|$)}) { ERROR("CORRUPTED_PATCH", @@ -1993,7 +2192,8 @@ sub process { # Check if it's the start of a commit log # (not a header line and we haven't seen the patch filename) if ($in_header_lines && $realfile =~ /^$/ && - $rawline !~ /^(commit\b|from\b|[\w-]+:).+$/i) { + !($rawline =~ /^\s+\S/ || + $rawline =~ /^(commit\b|from\b|[\w-]+:).*$/i)) { $in_header_lines = 0; $in_commit_log = 1; } @@ -2021,14 +2221,14 @@ sub process { if (ERROR("DOS_LINE_ENDINGS", "DOS line endings\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/[\s\015]+$//; + $fixed[$fixlinenr] =~ s/[\s\015]+$//; } } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; if (ERROR("TRAILING_WHITESPACE", "trailing whitespace\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/\s+$//; + $fixed[$fixlinenr] =~ s/\s+$//; } $rpt_cleaners = 1; @@ -2049,7 +2249,7 @@ sub process { # Only applies when adding the entry originally, after that we do not have # sufficient context to determine whether it is indeed long enough. if ($realfile =~ /Kconfig/ && - $line =~ /.\s*config\s+/) { + $line =~ /^\+\s*config\s+/) { my $length = 0; my $cnt = $realcnt; my $ln = $linenr + 1; @@ -2062,10 +2262,11 @@ sub process { $is_end = $lines[$ln - 1] =~ /^\+/; next if ($f =~ /^-/); + last if (!$file && $f =~ /^\@\@/); - if ($lines[$ln - 1] =~ /.\s*(?:bool|tristate)\s*\"/) { + if ($lines[$ln - 1] =~ /^\+\s*(?:bool|tristate)\s*\"/) { $is_start = 1; - } elsif ($lines[$ln - 1] =~ /.\s*(?:---)?help(?:---)?$/) { + } elsif ($lines[$ln - 1] =~ /^\+\s*(?:---)?help(?:---)?$/) { $length = -1; } @@ -2161,12 +2362,18 @@ sub process { "quoted string split across lines\n" . $hereprev); } +# check for missing a space in a string concatination + if ($prevrawline =~ /[^\\]\w"$/ && $rawline =~ /^\+[\t ]+"\w/) { + WARN('MISSING_SPACE', + "break quoted strings at a space character\n" . $hereprev); + } + # check for spaces before a quoted newline if ($rawline =~ /^.*\".*\s\\n/) { if (WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE", "unnecessary whitespace before a quoted newline\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/^(\+.*\".*)\s+\\n/$1\\n/; + $fixed[$fixlinenr] =~ s/^(\+.*\".*)\s+\\n/$1\\n/; } } @@ -2203,7 +2410,7 @@ sub process { if (ERROR("CODE_INDENT", "code indent should use tabs where possible\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; + $fixed[$fixlinenr] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; } } @@ -2213,9 +2420,9 @@ sub process { if (WARN("SPACE_BEFORE_TAB", "please, no space before tabs\n" . $herevet) && $fix) { - while ($fixed[$linenr - 1] =~ + while ($fixed[$fixlinenr] =~ s/(^\+.*) {8,8}+\t/$1\t\t/) {} - while ($fixed[$linenr - 1] =~ + while ($fixed[$fixlinenr] =~ s/(^\+.*) +\t/$1\t/) {} } } @@ -2249,19 +2456,19 @@ sub process { if (CHK("PARENTHESIS_ALIGNMENT", "Alignment should match open parenthesis\n" . $hereprev) && $fix && $line =~ /^\+/) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^\+[ \t]*/\+$goodtabindent/; } } } } - if ($line =~ /^\+.*\*[ \t]*\)[ \t]+(?!$Assignment|$Arithmetic)/) { + if ($line =~ /^\+.*\(\s*$Type\s*\)[ \t]+(?!$Assignment|$Arithmetic|{)/) { if (CHK("SPACING", - "No space is necessary after a cast\n" . $hereprev) && + "No space is necessary after a cast\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ - s/^(\+.*\*[ \t]*\))[ \t]+/$1/; + $fixed[$fixlinenr] =~ + s/(\(\s*$Type\s*\))[ \t]+/$1/; } } @@ -2291,10 +2498,44 @@ sub process { "networking block comments put the trailing */ on a separate line\n" . $herecurr); } +# check for missing blank lines after struct/union declarations +# with exceptions for various attributes and macros + if ($prevline =~ /^[\+ ]};?\s*$/ && + $line =~ /^\+/ && + !($line =~ /^\+\s*$/ || + $line =~ /^\+\s*EXPORT_SYMBOL/ || + $line =~ /^\+\s*MODULE_/i || + $line =~ /^\+\s*\#\s*(?:end|elif|else)/ || + $line =~ /^\+[a-z_]*init/ || + $line =~ /^\+\s*(?:static\s+)?[A-Z_]*ATTR/ || + $line =~ /^\+\s*DECLARE/ || + $line =~ /^\+\s*__setup/)) { + if (CHK("LINE_SPACING", + "Please use a blank line after function/struct/union/enum declarations\n" . $hereprev) && + $fix) { + fix_insert_line($fixlinenr, "\+"); + } + } + +# check for multiple consecutive blank lines + if ($prevline =~ /^[\+ ]\s*$/ && + $line =~ /^\+\s*$/ && + $last_blank_line != ($linenr - 1)) { + if (CHK("LINE_SPACING", + "Please don't use multiple blank lines\n" . $hereprev) && + $fix) { + fix_delete_line($fixlinenr, $rawline); + } + + $last_blank_line = $linenr; + } + # check for missing blank lines after declarations if ($sline =~ /^\+\s+\S/ && #Not at char 1 # actual declarations ($prevline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || + # function pointer declarations + $prevline =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || # foo bar; where foo is some local typedef or #define $prevline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || # known declaration macros @@ -2307,6 +2548,8 @@ sub process { $prevline =~ /(?:\{\s*|\\)$/) && # looks like a declaration !($sline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || + # function pointer declarations + $sline =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || # foo bar; where foo is some local typedef or #define $sline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || # known declaration macros @@ -2321,8 +2564,11 @@ sub process { $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/) && # indentation of previous and current line are the same (($prevline =~ /\+(\s+)\S/) && $sline =~ /^\+$1\S/)) { - WARN("SPACING", - "Missing a blank line after declarations\n" . $hereprev); + if (WARN("LINE_SPACING", + "Missing a blank line after declarations\n" . $hereprev) && + $fix) { + fix_insert_line($fixlinenr, "\+"); + } } # check for spaces at the beginning of a line. @@ -2335,13 +2581,33 @@ sub process { if (WARN("LEADING_SPACE", "please, no spaces at the start of a line\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; + $fixed[$fixlinenr] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; } } # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); +# check indentation of any line with a bare else +# if the previous line is a break or return and is indented 1 tab more... + if ($sline =~ /^\+([\t]+)(?:}[ \t]*)?else(?:[ \t]*{)?\s*$/) { + my $tabs = length($1) + 1; + if ($prevline =~ /^\+\t{$tabs,$tabs}(?:break|return)\b/) { + WARN("UNNECESSARY_ELSE", + "else is not generally useful after a break or return\n" . $hereprev); + } + } + +# check indentation of a line with a break; +# if the previous line is a goto or return and is indented the same # of tabs + if ($sline =~ /^\+([\t]+)break\s*;\s*$/) { + my $tabs = $1; + if ($prevline =~ /^\+$tabs(?:goto|return)\b/) { + WARN("UNNECESSARY_BREAK", + "break is not useful after a goto or return\n" . $hereprev); + } + } + # discourage the addition of CONFIG_EXPERIMENTAL in #if(def). if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) { WARN("CONFIG_EXPERIMENTAL", @@ -2477,7 +2743,7 @@ sub process { # if/while/etc brace do not go on next line, unless defining a do while loop, # or if that brace on the next line is for something else - if ($line =~ /(.*)\b((?:if|while|for|switch)\s*\(|do\b|else\b)/ && $line !~ /^.\s*\#/) { + if ($line =~ /(.*)\b((?:if|while|for|switch|(?:[a-z_]+|)for_each[a-z_]+)\s*\(|do\b|else\b)/ && $line !~ /^.\s*\#/) { my $pre_ctx = "$1$2"; my ($level, @ctx) = ctx_statement_level($linenr, $realcnt, 0); @@ -2504,7 +2770,7 @@ sub process { #print "realcnt<$realcnt> ctx_cnt<$ctx_cnt>\n"; #print "pre<$pre_ctx>\nline<$line>\nctx<$ctx>\nnext<$lines[$ctx_ln - 1]>\n"; - if ($ctx !~ /{\s*/ && defined($lines[$ctx_ln -1]) && $lines[$ctx_ln - 1] =~ /^\+\s*{/) { + if ($ctx !~ /{\s*/ && defined($lines[$ctx_ln - 1]) && $lines[$ctx_ln - 1] =~ /^\+\s*{/) { ERROR("OPEN_BRACE", "that open brace { should be on the previous line\n" . "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); @@ -2523,7 +2789,7 @@ sub process { } # Check relative indent for conditionals and blocks. - if ($line =~ /\b(?:(?:if|while|for)\s*\(|do\b)/ && $line !~ /^.\s*#/ && $line !~ /\}\s*while\s*/) { + if ($line =~ /\b(?:(?:if|while|for|(?:[a-z_]+|)for_each[a-z_]+)\s*\(|do\b)/ && $line !~ /^.\s*#/ && $line !~ /\}\s*while\s*/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0) if (!defined $stat); @@ -2654,8 +2920,18 @@ sub process { # check for initialisation to aggregates open brace on the next line if ($line =~ /^.\s*{/ && $prevline =~ /(?:^|[^=])=\s*$/) { - ERROR("OPEN_BRACE", - "that open brace { should be on the previous line\n" . $hereprev); + if (ERROR("OPEN_BRACE", + "that open brace { should be on the previous line\n" . $hereprev) && + $fix && $prevline =~ /^\+/ && $line =~ /^\+/) { + fix_delete_line($fixlinenr - 1, $prevrawline); + fix_delete_line($fixlinenr, $rawline); + my $fixedline = $prevrawline; + $fixedline =~ s/\s*=\s*$/ = {/; + fix_insert_line($fixlinenr, $fixedline); + $fixedline = $line; + $fixedline =~ s/^(.\s*){\s*/$1/; + fix_insert_line($fixlinenr, $fixedline); + } } # @@ -2680,10 +2956,10 @@ sub process { if (ERROR("C99_COMMENTS", "do not use C99 // comments\n" . $herecurr) && $fix) { - my $line = $fixed[$linenr - 1]; + my $line = $fixed[$fixlinenr]; if ($line =~ /\/\/(.*)$/) { my $comment = trim($1); - $fixed[$linenr - 1] =~ s@\/\/(.*)$@/\* $comment \*/@; + $fixed[$fixlinenr] =~ s@\/\/(.*)$@/\* $comment \*/@; } } } @@ -2742,7 +3018,7 @@ sub process { "do not initialise globals to 0 or NULL\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/; + $fixed[$fixlinenr] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/; } } # check for static initialisers. @@ -2751,10 +3027,17 @@ sub process { "do not initialise statics to 0 or NULL\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/; + $fixed[$fixlinenr] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/; } } +# check for misordered declarations of char/short/int/long with signed/unsigned + while ($sline =~ m{(\b$TypeMisordered\b)}g) { + my $tmp = trim($1); + WARN("MISORDERED_TYPE", + "type '$tmp' should be specified in [[un]signed] [short|int|long|long long] order\n" . $herecurr); + } + # check for static const char * arrays. if ($line =~ /\bstatic\s+const\s+char\s*\*\s*(\w+)\s*\[\s*\]\s*=\s*/) { WARN("STATIC_CONST_CHAR_ARRAY", @@ -2781,7 +3064,7 @@ sub process { if (ERROR("FUNCTION_WITHOUT_ARGS", "Bad function definition - $1() should probably be $1(void)\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(\b($Type)\s+($Ident))\s*\(\s*\)/$2 $3(void)/; + $fixed[$fixlinenr] =~ s/(\b($Type)\s+($Ident))\s*\(\s*\)/$2 $3(void)/; } } @@ -2790,7 +3073,7 @@ sub process { if (WARN("DEFINE_PCI_DEVICE_TABLE", "Prefer struct pci_device_id over deprecated DEFINE_PCI_DEVICE_TABLE\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b(?:static\s+|)DEFINE_PCI_DEVICE_TABLE\s*\(\s*(\w+)\s*\)\s*=\s*/static const struct pci_device_id $1\[\] = /; + $fixed[$fixlinenr] =~ s/\b(?:static\s+|)DEFINE_PCI_DEVICE_TABLE\s*\(\s*(\w+)\s*\)\s*=\s*/static const struct pci_device_id $1\[\] = /; } } @@ -2827,7 +3110,7 @@ sub process { my $sub_from = $ident; my $sub_to = $ident; $sub_to =~ s/\Q$from\E/$to/; - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s@\Q$sub_from\E@$sub_to@; } } @@ -2855,7 +3138,7 @@ sub process { my $sub_from = $match; my $sub_to = $match; $sub_to =~ s/\Q$from\E/$to/; - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s@\Q$sub_from\E@$sub_to@; } } @@ -2917,7 +3200,7 @@ sub process { if (WARN("PREFER_PR_LEVEL", "Prefer pr_warn(... to pr_warning(...\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\bpr_warning\b/pr_warn/; } } @@ -2933,17 +3216,40 @@ sub process { # function brace can't be on same line, except for #defines of do while, # or if closed on same line - if (($line=~/$Type\s*$Ident\(.*\).*\s{/) and + if (($line=~/$Type\s*$Ident\(.*\).*\s*{/) and !($line=~/\#\s*define.*do\s{/) and !($line=~/}/)) { - ERROR("OPEN_BRACE", - "open brace '{' following function declarations go on the next line\n" . $herecurr); + if (ERROR("OPEN_BRACE", + "open brace '{' following function declarations go on the next line\n" . $herecurr) && + $fix) { + fix_delete_line($fixlinenr, $rawline); + my $fixed_line = $rawline; + $fixed_line =~ /(^..*$Type\s*$Ident\(.*\)\s*){(.*)$/; + my $line1 = $1; + my $line2 = $2; + fix_insert_line($fixlinenr, ltrim($line1)); + fix_insert_line($fixlinenr, "\+{"); + if ($line2 !~ /^\s*$/) { + fix_insert_line($fixlinenr, "\+\t" . trim($line2)); + } + } } # open braces for enum, union and struct go on the same line. if ($line =~ /^.\s*{/ && $prevline =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?\s*$/) { - ERROR("OPEN_BRACE", - "open brace '{' following $1 go on the same line\n" . $hereprev); + if (ERROR("OPEN_BRACE", + "open brace '{' following $1 go on the same line\n" . $hereprev) && + $fix && $prevline =~ /^\+/ && $line =~ /^\+/) { + fix_delete_line($fixlinenr - 1, $prevrawline); + fix_delete_line($fixlinenr, $rawline); + my $fixedline = rtrim($prevrawline) . " {"; + fix_insert_line($fixlinenr, $fixedline); + $fixedline = $rawline; + $fixedline =~ s/^(.\s*){\s*/$1\t/; + if ($fixedline !~ /^\+\s*$/) { + fix_insert_line($fixlinenr, $fixedline); + } + } } # missing space after union, struct or enum definition @@ -2951,7 +3257,7 @@ sub process { if (WARN("SPACING", "missing space after $1 definition\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.\s*(?:typedef\s+)?(?:enum|union|struct)(?:\s+$Ident){1,2})([=\{])/$1 $2/; } } @@ -3021,7 +3327,7 @@ sub process { } if (show_type("SPACING") && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.\s*)$Declare\s*\(\s*\*\s*$Ident\s*\)\s*\(/$1 . $declare . $post_declare_space . '(*' . $funcname . ')('/ex; } } @@ -3038,7 +3344,7 @@ sub process { if (ERROR("BRACKET_SPACE", "space prohibited before open square bracket '['\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(\+.*?)\s+\[/$1\[/; } } @@ -3073,7 +3379,7 @@ sub process { if (WARN("SPACING", "space prohibited between function name and open parenthesis '('\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\b$name\s+\(/$name\(/; } } @@ -3341,8 +3647,8 @@ sub process { $fixed_line = $fixed_line . $fix_elements[$#elements]; } - if ($fix && $line_fixed && $fixed_line ne $fixed[$linenr - 1]) { - $fixed[$linenr - 1] = $fixed_line; + if ($fix && $line_fixed && $fixed_line ne $fixed[$fixlinenr]) { + $fixed[$fixlinenr] = $fixed_line; } @@ -3353,7 +3659,7 @@ sub process { if (WARN("SPACING", "space prohibited before semicolon\n" . $herecurr) && $fix) { - 1 while $fixed[$linenr - 1] =~ + 1 while $fixed[$fixlinenr] =~ s/^(\+.*\S)\s+;/$1;/; } } @@ -3386,7 +3692,7 @@ sub process { if (ERROR("SPACING", "space required before the open brace '{'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/^(\+.*(?:do|\))){/$1 {/; + $fixed[$fixlinenr] =~ s/^(\+.*(?:do|\))){/$1 {/; } } @@ -3404,7 +3710,7 @@ sub process { if (ERROR("SPACING", "space required after that close brace '}'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/}((?!(?:,|;|\)))\S)/} $1/; } } @@ -3414,7 +3720,7 @@ sub process { if (ERROR("SPACING", "space prohibited after that open square bracket '['\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\[\s+/\[/; } } @@ -3422,7 +3728,7 @@ sub process { if (ERROR("SPACING", "space prohibited before that close square bracket ']'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\s+\]/\]/; } } @@ -3433,7 +3739,7 @@ sub process { if (ERROR("SPACING", "space prohibited after that open parenthesis '('\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\(\s+/\(/; } } @@ -3443,18 +3749,27 @@ sub process { if (ERROR("SPACING", "space prohibited before that close parenthesis ')'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + print("fixlinenr: <$fixlinenr> fixed[fixlinenr]: <$fixed[$fixlinenr]>\n"); + $fixed[$fixlinenr] =~ s/\s+\)/\)/; } } +# check unnecessary parentheses around addressof/dereference single $Lvals +# ie: &(foo->bar) should be &foo->bar and *(foo->bar) should be *foo->bar + + while ($line =~ /(?:[^&]&\s*|\*)\(\s*($Ident\s*(?:$Member\s*)+)\s*\)/g) { + CHK("UNNECESSARY_PARENTHESES", + "Unnecessary parentheses around $1\n" . $herecurr); + } + #goto labels aren't indented, allow a single space however if ($line=~/^.\s+[A-Za-z\d_]+:(?![0-9]+)/ and !($line=~/^. [A-Za-z\d_]+:/) and !($line=~/^.\s+default:/)) { if (WARN("INDENTED_LABEL", "labels should not be indented\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.)\s+/$1/; } } @@ -3516,7 +3831,7 @@ sub process { if (ERROR("SPACING", "space required before the open parenthesis '('\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\b(if|while|for|switch)\(/$1 \(/; } } @@ -3606,7 +3921,7 @@ sub process { # if should not continue a brace if ($line =~ /}\s*if\b/) { ERROR("TRAILING_STATEMENTS", - "trailing statements should be on next line\n" . + "trailing statements should be on next line (or did you mean 'else if'?)\n" . $herecurr); } # case and default should not have general statements after them @@ -3622,14 +3937,26 @@ sub process { # Check for }<nl>else {, these must be at the same # indent level to be relevant to each other. - if ($prevline=~/}\s*$/ and $line=~/^.\s*else\s*/ and - $previndent == $indent) { - ERROR("ELSE_AFTER_BRACE", - "else should follow close brace '}'\n" . $hereprev); + if ($prevline=~/}\s*$/ and $line=~/^.\s*else\s*/ && + $previndent == $indent) { + if (ERROR("ELSE_AFTER_BRACE", + "else should follow close brace '}'\n" . $hereprev) && + $fix && $prevline =~ /^\+/ && $line =~ /^\+/) { + fix_delete_line($fixlinenr - 1, $prevrawline); + fix_delete_line($fixlinenr, $rawline); + my $fixedline = $prevrawline; + $fixedline =~ s/}\s*$//; + if ($fixedline !~ /^\+\s*$/) { + fix_insert_line($fixlinenr, $fixedline); + } + $fixedline = $rawline; + $fixedline =~ s/^(.\s*)else/$1} else/; + fix_insert_line($fixlinenr, $fixedline); + } } - if ($prevline=~/}\s*$/ and $line=~/^.\s*while\s*/ and - $previndent == $indent) { + if ($prevline=~/}\s*$/ and $line=~/^.\s*while\s*/ && + $previndent == $indent) { my ($s, $c) = ctx_statement_block($linenr, $realcnt, 0); # Find out what is on the end of the line after the @@ -3638,8 +3965,18 @@ sub process { $s =~ s/\n.*//g; if ($s =~ /^\s*;/) { - ERROR("WHILE_AFTER_BRACE", - "while should follow close brace '}'\n" . $hereprev); + if (ERROR("WHILE_AFTER_BRACE", + "while should follow close brace '}'\n" . $hereprev) && + $fix && $prevline =~ /^\+/ && $line =~ /^\+/) { + fix_delete_line($fixlinenr - 1, $prevrawline); + fix_delete_line($fixlinenr, $rawline); + my $fixedline = $prevrawline; + my $trailing = $rawline; + $trailing =~ s/^\+//; + $trailing = trim($trailing); + $fixedline =~ s/}\s*$/} $trailing/; + fix_insert_line($fixlinenr, $fixedline); + } } } @@ -3653,7 +3990,7 @@ sub process { "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr) && $fix) { my $hexval = sprintf("0x%x", oct($var)); - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\b$var\b/$hexval/; } } @@ -3689,7 +4026,7 @@ sub process { if (WARN("WHITESPACE_AFTER_LINE_CONTINUATION", "Whitespace after \\ makes next lines useless\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\s+$//; + $fixed[$fixlinenr] =~ s/\s+$//; } } @@ -3762,7 +4099,7 @@ sub process { $dstat !~ /^(?:$Ident|-?$Constant),$/ && # 10, // foo(), $dstat !~ /^(?:$Ident|-?$Constant);$/ && # foo(); $dstat !~ /^[!~-]?(?:$Lval|$Constant)$/ && # 10 // foo() // !foo // ~foo // -foo // foo->bar // foo.bar->baz - $dstat !~ /^'X'$/ && # character constants + $dstat !~ /^'X'$/ && $dstat !~ /^'XX'$/ && # character constants $dstat !~ /$exceptions/ && $dstat !~ /^\.$Ident\s*=/ && # .foo = $dstat !~ /^(?:\#\s*$Ident|\#\s*$Constant)\s*$/ && # stringification #foo @@ -4014,6 +4351,23 @@ sub process { } } +# check for unnecessary "Out of Memory" messages + if ($line =~ /^\+.*\b$logFunctions\s*\(/ && + $prevline =~ /^[ \+]\s*if\s*\(\s*(\!\s*|NULL\s*==\s*)?($Lval)(\s*==\s*NULL\s*)?\s*\)/ && + (defined $1 || defined $3) && + $linenr > 3) { + my $testval = $2; + my $testline = $lines[$linenr - 3]; + + my ($s, $c) = ctx_statement_block($linenr - 3, $realcnt, 0); +# print("line: <$line>\nprevline: <$prevline>\ns: <$s>\nc: <$c>\n\n\n"); + + if ($c =~ /(?:^|\n)[ \+]\s*(?:$Type\s*)?\Q$testval\E\s*=\s*(?:\([^\)]*\)\s*)?\s*(?:devm_)?(?:[kv][czm]alloc(?:_node|_array)?\b|kstrdup|(?:dev_)?alloc_skb)/) { + WARN("OOM_MESSAGE", + "Possible unnecessary 'out of memory' message\n" . $hereprev); + } + } + # check for bad placement of section $InitAttribute (e.g.: __initdata) if ($line =~ /(\b$InitAttribute\b)/) { my $attr = $1; @@ -4027,7 +4381,7 @@ sub process { WARN("MISPLACED_INIT", "$attr should be placed after $var\n" . $herecurr))) && $fix) { - $fixed[$linenr - 1] =~ s/(\bstatic\s+(?:const\s+)?)(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*([=;])\s*/"$1" . trim(string_find_replace($2, "\\s*$attr\\s*", " ")) . " " . trim(string_find_replace($3, "\\s*$attr\\s*", "")) . " $attr" . ("$4" eq ";" ? ";" : " = ")/e; + $fixed[$fixlinenr] =~ s/(\bstatic\s+(?:const\s+)?)(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*([=;])\s*/"$1" . trim(string_find_replace($2, "\\s*$attr\\s*", " ")) . " " . trim(string_find_replace($3, "\\s*$attr\\s*", "")) . " $attr" . ("$4" eq ";" ? ";" : " = ")/e; } } } @@ -4041,7 +4395,7 @@ sub process { if (ERROR("INIT_ATTRIBUTE", "Use of const init definition must use ${attr_prefix}initconst\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/$InitAttributeData/${attr_prefix}initconst/; } } @@ -4052,12 +4406,12 @@ sub process { if (ERROR("INIT_ATTRIBUTE", "Use of $attr requires a separate use of const\n" . $herecurr) && $fix) { - my $lead = $fixed[$linenr - 1] =~ + my $lead = $fixed[$fixlinenr] =~ /(^\+\s*(?:static\s+))/; $lead = rtrim($1); $lead = "$lead " if ($lead !~ /^\+$/); $lead = "${lead}const "; - $fixed[$linenr - 1] =~ s/(^\+\s*(?:static\s+))/$lead/; + $fixed[$fixlinenr] =~ s/(^\+\s*(?:static\s+))/$lead/; } } @@ -4070,7 +4424,7 @@ sub process { if (WARN("CONSTANT_CONVERSION", "$constant_func should be $func\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b$constant_func\b/$func/g; + $fixed[$fixlinenr] =~ s/\b$constant_func\b/$func/g; } } @@ -4120,7 +4474,7 @@ sub process { if (ERROR("SPACING", "exactly one space required after that #$1\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.\s*\#\s*(ifdef|ifndef|elif))\s{2,}/$1 /; } @@ -4168,7 +4522,7 @@ sub process { if (WARN("INLINE", "plain inline is preferred over $1\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b(__inline__|__inline)\b/inline/; + $fixed[$fixlinenr] =~ s/\b(__inline__|__inline)\b/inline/; } } @@ -4193,7 +4547,7 @@ sub process { if (WARN("PREFER_PRINTF", "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.*)\)\s*\)\s*\)/"__printf(" . trim($1) . ")"/ex; + $fixed[$fixlinenr] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.*)\)\s*\)\s*\)/"__printf(" . trim($1) . ")"/ex; } } @@ -4204,7 +4558,7 @@ sub process { if (WARN("PREFER_SCANF", "__scanf(string-index, first-to-check) is preferred over __attribute__((format(scanf, string-index, first-to-check)))\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\s*,\s*(.*)\)\s*\)\s*\)/"__scanf(" . trim($1) . ")"/ex; + $fixed[$fixlinenr] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\s*,\s*(.*)\)\s*\)\s*\)/"__scanf(" . trim($1) . ")"/ex; } } @@ -4219,7 +4573,7 @@ sub process { if (WARN("SIZEOF_PARENTHESIS", "sizeof $1 should be sizeof($1)\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/"sizeof(" . trim($1) . ")"/ex; + $fixed[$fixlinenr] =~ s/\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/"sizeof(" . trim($1) . ")"/ex; } } @@ -4242,7 +4596,7 @@ sub process { if (WARN("PREFER_SEQ_PUTS", "Prefer seq_puts to seq_printf\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\bseq_printf\b/seq_puts/; + $fixed[$fixlinenr] =~ s/\bseq_printf\b/seq_puts/; } } } @@ -4271,7 +4625,7 @@ sub process { if (WARN("PREFER_ETHER_ADDR_COPY", "Prefer ether_addr_copy() over memcpy() if the Ethernet addresses are __aligned(2)\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/ether_addr_copy($2, $7)/; + $fixed[$fixlinenr] =~ s/\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/ether_addr_copy($2, $7)/; } } @@ -4359,7 +4713,7 @@ sub process { if (CHK("AVOID_EXTERNS", "extern prototypes should be avoided in .h files\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/; + $fixed[$fixlinenr] =~ s/(.*)\bextern\b\s*(.*)/$1$2/; } } @@ -4419,23 +4773,24 @@ sub process { # check for k[mz]alloc with multiplies that could be kmalloc_array/kcalloc if ($^V && $^V ge 5.10.0 && - $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/) { + $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)\s*,/) { my $oldfunc = $3; my $a1 = $4; my $a2 = $10; my $newfunc = "kmalloc_array"; $newfunc = "kcalloc" if ($oldfunc eq "kzalloc"); - if ($a1 =~ /^sizeof\s*\S/ || $a2 =~ /^sizeof\s*\S/) { + my $r1 = $a1; + my $r2 = $a2; + if ($a1 =~ /^sizeof\s*\S/) { + $r1 = $a2; + $r2 = $a1; + } + if ($r1 !~ /^sizeof\b/ && $r2 =~ /^sizeof\s*\S/ && + !($r1 =~ /^$Constant$/ || $r1 =~ /^[A-Z_][A-Z0-9_]*$/)) { if (WARN("ALLOC_WITH_MULTIPLY", "Prefer $newfunc over $oldfunc with multiply\n" . $herecurr) && $fix) { - my $r1 = $a1; - my $r2 = $a2; - if ($a1 =~ /^sizeof\s*\S/) { - $r1 = $a2; - $r2 = $a1; - } - $fixed[$linenr - 1] =~ s/\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/$1 . ' = ' . "$newfunc(" . trim($r1) . ', ' . trim($r2)/e; + $fixed[$fixlinenr] =~ s/\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/$1 . ' = ' . "$newfunc(" . trim($r1) . ', ' . trim($r2)/e; } } @@ -4459,17 +4814,17 @@ sub process { if (WARN("ONE_SEMICOLON", "Statements terminations use 1 semicolon\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(\s*;\s*){2,}$/;/g; + $fixed[$fixlinenr] =~ s/(\s*;\s*){2,}$/;/g; } } -# check for case / default statements not preceeded by break/fallthrough/switch +# check for case / default statements not preceded by break/fallthrough/switch if ($line =~ /^.\s*(?:case\s+(?:$Ident|$Constant)\s*|default):/) { my $has_break = 0; my $has_statement = 0; my $count = 0; my $prevline = $linenr; - while ($prevline > 1 && $count < 3 && !$has_break) { + while ($prevline > 1 && ($file || $count < 3) && !$has_break) { $prevline--; my $rline = $rawlines[$prevline - 1]; my $fline = $lines[$prevline - 1]; @@ -4507,7 +4862,7 @@ sub process { if (WARN("USE_FUNC", "__func__ should be used instead of gcc specific __FUNCTION__\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b__FUNCTION__\b/__func__/g; + $fixed[$fixlinenr] =~ s/\b__FUNCTION__\b/__func__/g; } } @@ -4750,12 +5105,16 @@ sub process { hash_show_words(\%use_type, "Used"); hash_show_words(\%ignore_type, "Ignored"); - if ($clean == 0 && $fix && "@rawlines" ne "@fixed") { + if ($clean == 0 && $fix && + ("@rawlines" ne "@fixed" || + $#fixed_inserted >= 0 || $#fixed_deleted >= 0)) { my $newfile = $filename; $newfile .= ".EXPERIMENTAL-checkpatch-fixes" if (!$fix_inplace); my $linecount = 0; my $f; + @fixed = fix_inserted_deleted_lines(\@fixed, \@fixed_inserted, \@fixed_deleted); + open($f, '>', $newfile) or die "$P: Can't open $newfile for write\n"; foreach my $fixed_line (@fixed) { @@ -4763,7 +5122,7 @@ sub process { if ($file) { if ($linecount > 3) { $fixed_line =~ s/^\+//; - print $f $fixed_line. "\n"; + print $f $fixed_line . "\n"; } } else { print $f $fixed_line . "\n"; |