diff options
100 files changed, 2589 insertions, 1562 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index 8b093f8..91bd6ca 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -346,6 +346,10 @@ Description: number of objects per slab. If a slab cannot be allocated because of fragmentation, SLUB will retry with the minimum order possible depending on its characteristics. + When debug_guardpage_minorder=N (N > 0) parameter is specified + (see Documentation/kernel-parameters.txt), the minimum possible + order is used and this sysfs entry can not be used to change + the order at run time. What: /sys/kernel/slab/cache/order_fallback Date: April 2008 diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 4d8774f..4c95c00 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -61,7 +61,7 @@ Brief summary of control files. memory.failcnt # show the number of memory usage hits limits memory.memsw.failcnt # show the number of memory+Swap hits limits memory.max_usage_in_bytes # show max memory usage recorded - memory.memsw.usage_in_bytes # show max memory+Swap usage recorded + memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded memory.soft_limit_in_bytes # set/show soft limit of memory usage memory.stat # show various statistics memory.use_hierarchy # set/show hierarchical account enabled @@ -410,8 +410,11 @@ memory.stat file includes following statistics cache - # of bytes of page cache memory. rss - # of bytes of anonymous and swap cache memory. mapped_file - # of bytes of mapped file (includes tmpfs/shmem) -pgpgin - # of pages paged in (equivalent to # of charging events). -pgpgout - # of pages paged out (equivalent to # of uncharging events). +pgpgin - # of charging events to the memory cgroup. The charging + event happens each time a page is accounted as either mapped + anon page(RSS) or cache page(Page Cache) to the cgroup. +pgpgout - # of uncharging events to the memory cgroup. The uncharging + event happens each time a page is unaccounted from the cgroup. swap - # of bytes of swap usage inactive_anon - # of bytes of anonymous memory and swap cache memory on LRU list. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 12fee13..a76a26a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -307,6 +307,9 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7) blkio_ticks time spent waiting for block IO gtime guest time of the task in jiffies cgtime guest time of the task children in jiffies + start_data address above which program data+bss is placed + end_data address below which program data+bss is placed + start_brk address above which program heap can be expanded with brk() .............................................................................. The /proc/PID/maps file containing the currently mapped memory regions and diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6d8cd8b..8c20fbd 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -415,6 +415,14 @@ PIDs of value pid_max or larger are not allocated. ============================================================== +ns_last_pid: + +The last pid allocated in the current (the one task using this sysctl +lives in) pid namespace. When selecting a pid for a next task on fork +kernel tries to allocate a number starting from this one. + +============================================================== + powersave-nap: (PPC only) If set, Linux-PPC will use the 'nap' mode of powersaving, diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt index 2acdda9..6752870 100644 --- a/Documentation/vm/slub.txt +++ b/Documentation/vm/slub.txt @@ -131,7 +131,10 @@ slub_min_objects. slub_max_order specified the order at which slub_min_objects should no longer be checked. This is useful to avoid SLUB trying to generate super large order pages to fit slub_min_objects of a slab cache with -large object sizes into one high order page. +large object sizes into one high order page. Setting command line +parameter debug_guardpage_minorder=N (N > 0), forces setting +slub_max_order to 0, what cause minimum possible order of slabs +allocation. SLUB Debug output ----------------- diff --git a/arch/Kconfig b/arch/Kconfig index 2505740..4f55c73 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -185,4 +185,18 @@ config HAVE_RCU_TABLE_FREE config ARCH_HAVE_NMI_SAFE_CMPXCHG bool +config HAVE_ALIGNED_STRUCT_PAGE + bool + help + This makes sure that struct pages are double word aligned and that + e.g. the SLUB allocator can perform double word atomic operations + on a struct page for better performance. However selecting this + might increase the size of a struct page by a word. + +config HAVE_CMPXCHG_LOCAL + bool + +config HAVE_CMPXCHG_DOUBLE + bool + source "kernel/gcov/Kconfig" diff --git a/arch/avr32/include/asm/system.h b/arch/avr32/include/asm/system.h index 9702c221..62d9ded 100644 --- a/arch/avr32/include/asm/system.h +++ b/arch/avr32/include/asm/system.h @@ -169,7 +169,7 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr, #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n)) struct pt_regs; -void NORET_TYPE die(const char *str, struct pt_regs *regs, long err); +void die(const char *str, struct pt_regs *regs, long err); void _exception(long signr, struct pt_regs *regs, int code, unsigned long addr); diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c index 7aa2575..3d760c0 100644 --- a/arch/avr32/kernel/traps.c +++ b/arch/avr32/kernel/traps.c @@ -24,7 +24,7 @@ static DEFINE_SPINLOCK(die_lock); -void NORET_TYPE die(const char *str, struct pt_regs *regs, long err) +void die(const char *str, struct pt_regs *regs, long err) { static int die_counter; diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index d9f397f..691be0b 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -309,7 +309,6 @@ struct thread_struct { } #define start_thread(regs,new_ip,new_sp) do { \ - set_fs(USER_DS); \ regs->cr_ipsr = ((regs->cr_ipsr | (IA64_PSR_BITS_TO_SET | IA64_PSR_CPL)) \ & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_RI | IA64_PSR_IS)); \ regs->cr_iip = new_ip; \ diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index 3d3aeef..4eed358 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -27,11 +27,11 @@ #include <asm/sal.h> #include <asm/mca.h> -typedef NORET_TYPE void (*relocate_new_kernel_t)( +typedef void (*relocate_new_kernel_t)( unsigned long indirection_page, unsigned long start_address, struct ia64_boot_param *boot_param, - unsigned long pal_addr) ATTRIB_NORET; + unsigned long pal_addr) __noreturn; struct kimage *ia64_kimage; diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index 82a4bb5..b95a451 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -511,8 +511,7 @@ static unsigned long amiga_gettimeoffset(void) return ticks + offset; } -static NORET_TYPE void amiga_reset(void) - ATTRIB_NORET; +static void amiga_reset(void) __noreturn; static void amiga_reset(void) { diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index de39b1f..7b99c67 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -144,7 +144,7 @@ extern int ptrace_set_watch_regs(struct task_struct *child, extern asmlinkage void syscall_trace_enter(struct pt_regs *regs); extern asmlinkage void syscall_trace_leave(struct pt_regs *regs); -extern NORET_TYPE void die(const char *, struct pt_regs *) ATTRIB_NORET; +extern void die(const char *, struct pt_regs *) __noreturn; static inline void die_if_kernel(const char *str, struct pt_regs *regs) { diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 5c8a49d..bbddb86 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -1340,7 +1340,7 @@ void ejtag_exception_handler(struct pt_regs *regs) /* * NMI exception handler. */ -NORET_TYPE void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs) +void __noreturn nmi_exception_handler(struct pt_regs *regs) { bust_spinlocks(1); printk("NMI taken!!!!\n"); diff --git a/arch/mn10300/include/asm/exceptions.h b/arch/mn10300/include/asm/exceptions.h index ca3e205..95a4d42 100644 --- a/arch/mn10300/include/asm/exceptions.h +++ b/arch/mn10300/include/asm/exceptions.h @@ -110,7 +110,7 @@ extern asmlinkage void nmi_handler(void); extern asmlinkage void misalignment(struct pt_regs *, enum exception_code); extern void die(const char *, struct pt_regs *, enum exception_code) - ATTRIB_NORET; + __noreturn; extern int die_if_no_fixup(const char *, struct pt_regs *, enum exception_code); diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 9ce66e9..7213ec9 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -196,7 +196,6 @@ typedef unsigned int elf_caddr_t; /* offset pc for priv. level */ \ pc |= 3; \ \ - set_fs(USER_DS); \ regs->iasq[0] = spaceid; \ regs->iasq[1] = spaceid; \ regs->iaoq[0] = pc; \ @@ -299,7 +298,6 @@ on downward growing arches, it looks like this: elf_addr_t pc = (elf_addr_t)new_pc | 3; \ elf_caddr_t *argv = (elf_caddr_t *)bprm->exec + 1; \ \ - set_fs(USER_DS); \ regs->iasq[0] = spaceid; \ regs->iasq[1] = spaceid; \ regs->iaoq[0] = pc; \ diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 4b4b918..62c60b8 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -192,7 +192,6 @@ void flush_thread(void) /* Only needs to handle fpu stuff or perf monitors. ** REVISIT: several arches implement a "lazy fpu state". */ - set_fs(USER_DS); } void release_thread(struct task_struct *dead_task) diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c index e63f2e7..affe5dc 100644 --- a/arch/powerpc/kernel/machine_kexec_32.c +++ b/arch/powerpc/kernel/machine_kexec_32.c @@ -16,10 +16,10 @@ #include <asm/hw_irq.h> #include <asm/io.h> -typedef NORET_TYPE void (*relocate_new_kernel_t)( +typedef void (*relocate_new_kernel_t)( unsigned long indirection_page, unsigned long reboot_code_buffer, - unsigned long start_address) ATTRIB_NORET; + unsigned long start_address) __noreturn; /* * This is a generic machine_kexec function suitable at least for diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 26ccbf7..d7f6090 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -307,9 +307,9 @@ static union thread_union kexec_stack __init_task_data = struct paca_struct kexec_paca; /* Our assembly helper, in kexec_stub.S */ -extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start, - void *image, void *control, - void (*clear_all)(void)) ATTRIB_NORET; +extern void kexec_sequence(void *newstack, unsigned long start, + void *image, void *control, + void (*clear_all)(void)) __noreturn; /* too late to fail here */ void default_machine_kexec(struct kimage *image) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 4ff3d8e..3feefc3 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -58,7 +58,7 @@ static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * - * Note: node_to_cpumask() is not valid until after this is done. + * Note: cpumask_of_node() is not valid until after this is done. */ static void __init setup_node_to_cpumask_map(void) { diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 330a57b..36f957f 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -638,7 +638,6 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, /* These are almost always orderly shutdowns. */ return; case KMSG_DUMP_OOPS: - case KMSG_DUMP_KEXEC: break; case KMSG_DUMP_PANIC: panicking = true; diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 27272f6..d25843a 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -236,7 +236,7 @@ static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc) /* * Function to drop a processor into disabled wait state */ -static inline void ATTRIB_NORET disabled_wait(unsigned long code) +static inline void __noreturn disabled_wait(unsigned long code) { unsigned long ctl_buf; psw_t dw_psw; diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index fab8843..0fd2e86 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -30,7 +30,7 @@ struct mcck_struct { static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); -static NORET_TYPE void s390_handle_damage(char *msg) +static void s390_handle_damage(char *msg) { smp_send_stop(); disabled_wait((unsigned long) __builtin_return_address(0)); diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index aaf6d59..7ec6651 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -70,7 +70,7 @@ void show_regs(struct pt_regs * regs) /* * Create a kernel thread */ -ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *)) +__noreturn void kernel_thread_helper(void *arg, int (*fn)(void *)) { do_exit(fn(arg)); } diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index 210c1ca..cbd4e4b 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -285,7 +285,7 @@ void show_regs(struct pt_regs *regs) /* * Create a kernel thread */ -ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *)) +__noreturn void kernel_thread_helper(void *arg, int (*fn)(void *)) { do_exit(fn(arg)); } diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c index e00d717..6255f2e 100644 --- a/arch/tile/kernel/machine_kexec.c +++ b/arch/tile/kernel/machine_kexec.c @@ -248,11 +248,11 @@ static void setup_quasi_va_is_pa(void) } -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { void *reboot_code_buffer; - NORET_TYPE void (*rnk)(unsigned long, void *, unsigned long) - ATTRIB_NORET; + void (*rnk)(unsigned long, void *, unsigned long) + __noreturn; /* Mask all interrupts before starting to reboot. */ interrupt_mask_set_mask(~0ULL); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a150f4c..6c14ecd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,9 @@ config X86 select PERF_EVENTS select HAVE_PERF_EVENTS_NMI select ANON_INODES + select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 + select HAVE_CMPXCHG_LOCAL if !M386 + select HAVE_CMPXCHG_DOUBLE select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index e3ca7e0..3c57033 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -309,12 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) -config CMPXCHG_LOCAL - def_bool X86_64 || (X86_32 && !M386) - -config CMPXCHG_DOUBLE - def_bool y - config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 020cd2e..19d3fa0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu) * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * - * Note: node_to_cpumask() is not valid until after this is done. + * Note: cpumask_of_node() is not valid until after this is done. * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) */ void __init setup_node_to_cpumask_map(void) diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 1d97bd8..b2b54d2 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -6,14 +6,6 @@ menu "UML-specific options" menu "Host processor type and features" -config CMPXCHG_LOCAL - bool - default n - -config CMPXCHG_DOUBLE - bool - default n - source "arch/x86/Kconfig.cpu" endmenu diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f17e3ea..ed5de58 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -295,11 +295,22 @@ static int memory_block_change_state(struct memory_block *mem, ret = memory_block_action(mem->start_section_nr, to_state); - if (ret) + if (ret) { mem->state = from_state_req; - else - mem->state = to_state; + goto out; + } + mem->state = to_state; + switch (mem->state) { + case MEM_OFFLINE: + kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE); + break; + case MEM_ONLINE: + kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE); + break; + default: + break; + } out: mutex_unlock(&mem->state_mutex); return ret; diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c index 7c7f42a1f8..9fec323 100644 --- a/drivers/char/ramoops.c +++ b/drivers/char/ramoops.c @@ -83,8 +83,7 @@ static void ramoops_do_dump(struct kmsg_dumper *dumper, struct timeval timestamp; if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC && - reason != KMSG_DUMP_KEXEC) + reason != KMSG_DUMP_PANIC) return; /* Only dump oopses if dump_oops is set */ @@ -126,8 +125,8 @@ static int __init ramoops_probe(struct platform_device *pdev) goto fail3; } - rounddown_pow_of_two(pdata->mem_size); - rounddown_pow_of_two(pdata->record_size); + pdata->mem_size = rounddown_pow_of_two(pdata->mem_size); + pdata->record_size = rounddown_pow_of_two(pdata->record_size); /* Check for the minimum memory size */ if (pdata->mem_size < MIN_MEM_SIZE && @@ -148,14 +147,6 @@ static int __init ramoops_probe(struct platform_device *pdev) cxt->phys_addr = pdata->mem_address; cxt->record_size = pdata->record_size; cxt->dump_oops = pdata->dump_oops; - /* - * Update the module parameter variables as well so they are visible - * through /sys/module/ramoops/parameters/ - */ - mem_size = pdata->mem_size; - mem_address = pdata->mem_address; - record_size = pdata->record_size; - dump_oops = pdata->dump_oops; if (!request_mem_region(cxt->phys_addr, cxt->size, "ramoops")) { pr_err("request mem region failed\n"); @@ -176,6 +167,15 @@ static int __init ramoops_probe(struct platform_device *pdev) goto fail1; } + /* + * Update the module parameter variables as well so they are visible + * through /sys/module/ramoops/parameters/ + */ + mem_size = pdata->mem_size; + mem_address = pdata->mem_address; + record_size = pdata->record_size; + dump_oops = pdata->dump_oops; + return 0; fail1: diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index db8e827..3ce99e0 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -315,8 +315,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper, char *dst; if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC && - reason != KMSG_DUMP_KEXEC) + reason != KMSG_DUMP_PANIC) return; /* Only dump oopses if dump_oops is set */ diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c index d0b597b..0cb64f5 100644 --- a/drivers/parport/parport_pc.c +++ b/drivers/parport/parport_pc.c @@ -3404,8 +3404,8 @@ static int __init parport_init_mode_setup(char *str) #endif #ifdef MODULE -static const char *irq[PARPORT_PC_MAX_PORTS]; -static const char *dma[PARPORT_PC_MAX_PORTS]; +static char *irq[PARPORT_PC_MAX_PORTS]; +static char *dma[PARPORT_PC_MAX_PORTS]; MODULE_PARM_DESC(io, "Base I/O address (SPP regs)"); module_param_array(io, int, NULL, 0); diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c index 081dc47..fe13ac5 100644 --- a/drivers/video/nvidia/nvidia.c +++ b/drivers/video/nvidia/nvidia.c @@ -81,7 +81,7 @@ static int vram __devinitdata = 0; static int bpp __devinitdata = 8; static int reverse_i2c __devinitdata; #ifdef CONFIG_MTRR -static int nomtrr __devinitdata = 0; +static bool nomtrr __devinitdata = false; #endif #ifdef CONFIG_PMAC_BACKLIGHT static int backlight __devinitdata = 1; @@ -1509,7 +1509,7 @@ static int __devinit nvidiafb_setup(char *options) backlight = simple_strtoul(this_opt+10, NULL, 0); #ifdef CONFIG_MTRR } else if (!strncmp(this_opt, "nomtrr", 6)) { - nomtrr = 1; + nomtrr = true; #endif } else if (!strncmp(this_opt, "fpdither:", 9)) { fpdither = simple_strtol(this_opt+9, NULL, 0); @@ -1599,7 +1599,7 @@ MODULE_PARM_DESC(bpp, "pixel width in bits" module_param(reverse_i2c, int, 0); MODULE_PARM_DESC(reverse_i2c, "reverse port assignment of the i2c bus"); #ifdef CONFIG_MTRR -module_param(nomtrr, bool, 0); +module_param(nomtrr, bool, false); MODULE_PARM_DESC(nomtrr, "Disables MTRR support (0 or 1=disabled) " "(default=0)"); #endif diff --git a/fs/block_dev.c b/fs/block_dev.c index afe74dd..0e575d1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1139,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { struct backing_dev_info *bdi; @@ -1159,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; + bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); put_disk(disk); @@ -1232,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; + bdev->bd_queue = NULL; bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f99a099..d852566 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -872,7 +872,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { /* * we can't safely write a btree page from here, @@ -887,7 +888,7 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/fs/direct-io.c b/fs/direct-io.c index d740ab6..4a588db 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -36,6 +36,7 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <linux/atomic.h> +#include <linux/prefetch.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, { int ret; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ + sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ - unsigned long dio_count;/* Number of dio_block-sized blocks */ - unsigned long blkmask; int create; /* @@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, if (ret == 0) { BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); fs_startblk = sdio->block_in_file >> sdio->blkfactor; - dio_count = sdio->final_block_in_request - sdio->block_in_file; - fs_count = dio_count >> sdio->blkfactor; - blkmask = (1 << sdio->blkfactor) - 1; - if (dio_count & blkmask) - fs_count++; + fs_endblk = (sdio->final_block_in_request - 1) >> + sdio->blkfactor; + fs_count = fs_endblk - fs_startblk + 1; map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; @@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio) * individual fields and will generate much worse code. This is important * for the whole file. */ -ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +static inline ssize_t +do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) @@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, size_t size; unsigned long addr; unsigned blkbits = inode->i_blkbits; - unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; @@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw & WRITE) rw = WRITE_ODIRECT; - if (bdev) - bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + /* + * Avoid references to bdev if not absolutely needed to give + * the early prefetch in the caller enough time. + */ if (offset & blocksize_mask) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits(bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; if (offset & blocksize_mask) goto out; @@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, addr = (unsigned long)iov[seg].iov_base; size = iov[seg].iov_len; end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) { + if (unlikely((addr & blocksize_mask) || + (size & blocksize_mask))) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits( + bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) + if ((addr & blocksize_mask) || (size & blocksize_mask)) goto out; } } @@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, out: return retval; } + +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + /* + * The block device state is needed in the end to finally + * submit everything. Since it's likely to be cache cold + * prefetch it here as first thing to hide some of the + * latency. + * + * Attempt to prefetch the pieces we likely need later. + */ + prefetch(&bdev->bd_disk->part_tbl); + prefetch(bdev->bd_queue); + prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); + + return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, + submit_io, flags); +} + EXPORT_SYMBOL(__blockdev_direct_IO); static __init int dio_init(void) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 828e750..aabdfc3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -197,6 +197,12 @@ struct eventpoll { /* The user that created the eventpoll descriptor */ struct user_struct *user; + + struct file *file; + + /* used to optimize loop detection check */ + int visited; + struct list_head visited_list_link; }; /* Wait structure used by the poll hooks */ @@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; +/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ +static LIST_HEAD(visited_list); + +/* + * List of files with newly added links, where we may need to limit the number + * of emanating paths. Protected by the epmutex. + */ +static LIST_HEAD(tfile_check_list); + #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -276,6 +291,12 @@ ctl_table epoll_table[] = { }; #endif /* CONFIG_SYSCTL */ +static const struct file_operations eventpoll_fops; + +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, @@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = { .llseek = noop_llseek, }; -/* Fast test to see if the file is an eventpoll file */ -static inline int is_file_epoll(struct file *f) -{ - return f->f_op == &eventpoll_fops; -} - /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are @@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + +#define PATH_ARR_SIZE 5 +/* + * These are the number paths of length 1 to 5, that we are allowing to emanate + * from a single file of interest. For example, we allow 1000 paths of length + * 1, to emanate from each file of interest. This essentially represents the + * potential wakeup paths, which need to be limited in order to avoid massive + * uncontrolled wakeup storms. The common use case should be a single ep which + * is connected to n file sources. In this case each file source has 1 path + * of length 1. Thus, the numbers below should be more than sufficient. These + * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify + * and delete can't add additional paths. Protected by the epmutex. + */ +static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; +static int path_count[PATH_ARR_SIZE]; + +static int path_count_inc(int nests) +{ + if (++path_count[nests] > path_limits[nests]) + return -1; + return 0; +} + +static void path_count_init(void) +{ + int i; + + for (i = 0; i < PATH_ARR_SIZE; i++) + path_count[i] = 0; +} + +static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct file *child_file; + struct epitem *epi; + + list_for_each_entry(epi, &file->f_ep_links, fllink) { + child_file = epi->ep->file; + if (is_file_epoll(child_file)) { + if (list_empty(&child_file->f_ep_links)) { + if (path_count_inc(call_nests)) { + error = -1; + break; + } + } else { + error = ep_call_nested(&poll_loop_ncalls, + EP_MAX_NESTS, + reverse_path_check_proc, + child_file, child_file, + current); + } + if (error != 0) + break; + } else { + printk(KERN_ERR "reverse_path_check_proc: " + "file is not an ep!\n"); + } + } + return error; +} + +/** + * reverse_path_check - The tfile_check_list is list of file *, which have + * links that are proposed to be newly added. We need to + * make sure that those added links don't add too many + * paths such that we will spend all our time waking up + * eventpoll objects. + * + * Returns: Returns zero if the proposed links don't create too many paths, + * -1 otherwise. + */ +static int reverse_path_check(void) +{ + int length = 0; + int error = 0; + struct file *current_file; + + /* let's call this for all tfiles */ + list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + length++; + path_count_init(); + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + reverse_path_check_proc, current_file, + current_file, current); + if (error) + break; + } + return error; +} + /* * Must be called with "mtx" held. */ @@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, */ ep_rbtree_insert(ep, epi); + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (reverse_path_check()) + goto error_remove_epi; + /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); @@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, return 0; +error_remove_epi: + spin_lock(&tfile->f_lock); + if (ep_is_linked(&epi->fllink)) + list_del_init(&epi->fllink); + spin_unlock(&tfile->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + error_unregister: ep_unregister_pollwait(ep, epi); @@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) int error = 0; struct file *file = priv; struct eventpoll *ep = file->private_data; + struct eventpoll *ep_tovisit; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, call_nests + 1); + ep->visited = 1; + list_add(&ep->visited_list_link, &visited_list); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { + ep_tovisit = epi->ffd.file->private_data; + if (ep_tovisit->visited) + continue; error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, - ep_loop_check_proc, epi->ffd.file, - epi->ffd.file->private_data, current); + ep_loop_check_proc, epi->ffd.file, + ep_tovisit, current); if (error != 0) break; + } else { + /* + * If we've reached a file that is not associated with + * an ep, then we need to check if the newly added + * links are going to add too many wakeup paths. We do + * this by adding it to the tfile_check_list, if it's + * not already there, and calling reverse_path_check() + * during ep_insert(). + */ + if (list_empty(&epi->ffd.file->f_tfile_llink)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); } } mutex_unlock(&ep->mtx); @@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + int ret; + struct eventpoll *ep_cur, *ep_next; + + ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, ep_loop_check_proc, file, ep, current); + /* clear visited list */ + list_for_each_entry_safe(ep_cur, ep_next, &visited_list, + visited_list_link) { + ep_cur->visited = 0; + list_del(&ep_cur->visited_list_link); + } + return ret; +} + +static void clear_tfile_check_list(void) +{ + struct file *file; + + /* first clear the tfile_check_list */ + while (!list_empty(&tfile_check_list)) { + file = list_first_entry(&tfile_check_list, struct file, + f_tfile_llink); + list_del_init(&file->f_tfile_llink); + } + INIT_LIST_HEAD(&tfile_check_list); } /* @@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) */ SYSCALL_DEFINE1(epoll_create1, int, flags) { - int error; + int error, fd; struct eventpoll *ep = NULL; + struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); @@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); + if (fd < 0) { + error = fd; + goto out_free_ep; + } + file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); - if (error < 0) - ep_free(ep); - + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_fd; + } + fd_install(fd, file); + ep->file = file; + return fd; + +out_free_fd: + put_unused_fd(fd); +out_free_ep: + ep_free(ep); return error; } @@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* * When we insert an epoll file descriptor, inside another epoll file * descriptor, there is the change of creating closed loops, which are - * better be handled here, than in more critical paths. + * better be handled here, than in more critical paths. While we are + * checking for loops we also determine the list of files reachable + * and hang them on the tfile_check_list, so we can check that we + * haven't created too many possible wakeup paths. * - * We hold epmutex across the loop check and the insert in this case, in - * order to prevent two separate inserts from racing and each doing the - * insert "at the same time" such that ep_loop_check passes on both - * before either one does the insert, thereby creating a cycle. + * We need to hold the epmutex across both ep_insert and ep_remove + * b/c we want to make sure we are looking at a coherent view of + * epoll network. */ - if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { mutex_lock(&epmutex); did_lock_epmutex = 1; - error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) - goto error_tgt_fput; } - + if (op == EPOLL_CTL_ADD) { + if (is_file_epoll(tfile)) { + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } else + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } mutex_lock_nested(&ep->mtx, 0); @@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; + clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: - if (unlikely(did_lock_epmutex)) + if (did_lock_epmutex) mutex_unlock(&epmutex); fput(tfile); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e425ad9..1e85a7a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page) } static int hugetlbfs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { int rc; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5ee9253..8102db9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -332,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, enum migrate_mode); #else #define nfs_migrate_page NULL #endif diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0c38852..834f0fe 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1688,7 +1688,7 @@ out_error: #ifdef CONFIG_MIGRATION int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page) + struct page *page, enum migrate_mode mode) { /* * If PagePrivate is set, then the page is currently associated with @@ -1703,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } #endif @@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) if (nr_pages < pipe->nrbufs) return -EBUSY; - bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); + bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; diff --git a/fs/proc/array.c b/fs/proc/array.c index 8c344f0..9252ee3 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", pid_nr_ns(pid, ns), tcomm, state, @@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, task->policy, (unsigned long long)delayacct_blkio_ticks(task), cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime)); + cputime_to_clock_t(cgtime), + (mm && permitted) ? mm->start_data : 0, + (mm && permitted) ? mm->end_data : 0, + (mm && permitted) ? mm->start_brk : 0); if (mm) mmput(mm); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 8173dfd..5485a53 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -654,6 +654,8 @@ static int proc_pid_permission(struct inode *inode, int mask) bool has_perms; task = get_proc_task(inode); + if (!task) + return -ESRCH; has_perms = has_pid_permissions(pid, task, 1); put_task_struct(task); diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index e58fa77..f96a5b5 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -139,6 +139,20 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) +/** + * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation + * This is a nop so far, because only x86 needs it. + */ +#ifndef __tlb_remove_pmd_tlb_entry +#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) +#endif + +#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ + do { \ + tlb->need_flush = 1; \ + __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ + } while (0) + #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 5c4abce..b936763 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -5,6 +5,7 @@ #include <linux/kexec.h> #include <linux/device.h> #include <linux/proc_fs.h> +#include <linux/elf.h> #define ELFCORE_ADDR_MAX (-1ULL) #define ELFCORE_ADDR_ERR (-2ULL) diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index f362733..657ab55 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -61,6 +61,7 @@ struct file; static inline void eventpoll_init_file(struct file *file) { INIT_LIST_HEAD(&file->f_ep_links); + INIT_LIST_HEAD(&file->f_tfile_llink); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 7aacf31..4bc8169 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -525,6 +525,7 @@ enum positive_aop_returns { struct page; struct address_space; struct writeback_control; +enum migrate_mode; struct iov_iter { const struct iovec *iov; @@ -609,9 +610,12 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, unsigned long *); - /* migrate the contents of a page to the specified target */ + /* + * migrate the contents of a page to the specified target. If sync + * is false, it must not block. + */ int (*migratepage) (struct address_space *, - struct page *, struct page *); + struct page *, struct page *, enum migrate_mode); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); @@ -656,6 +660,7 @@ struct address_space { * must be enforced here for CRIS, to let the least significant bit * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. */ +struct request_queue; struct block_device { dev_t bd_dev; /* not a kdev_t - it's a search key */ @@ -678,6 +683,7 @@ struct block_device { unsigned bd_part_count; int bd_invalidated; struct gendisk * bd_disk; + struct request_queue * bd_queue; struct list_head bd_list; /* * Private data. You must have bd_claim'ed the block_device @@ -1001,6 +1007,7 @@ struct file { #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; + struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; #ifdef CONFIG_DEBUG_WRITECOUNT @@ -2536,7 +2543,8 @@ extern int generic_check_addressable(unsigned, u64); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, + enum migrate_mode); #else #define buffer_migrate_page NULL #endif diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a9ace9c..1b92129 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -18,7 +18,7 @@ extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, unsigned int flags); extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, - pmd_t *pmd); + pmd_t *pmd, unsigned long addr); extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d0a7a0c..e834342 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -185,16 +185,17 @@ static inline void might_fault(void) extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); -NORET_TYPE void panic(const char * fmt, ...) - __attribute__ ((NORET_AND format (printf, 1, 2))) __cold; +__printf(1, 2) +void panic(const char *fmt, ...) + __noreturn __cold; extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); extern int oops_may_print(void); -NORET_TYPE void do_exit(long error_code) - ATTRIB_NORET; -NORET_TYPE void complete_and_exit(struct completion *, long) - ATTRIB_NORET; +void do_exit(long error_code) + __noreturn; +void complete_and_exit(struct completion *, long) + __noreturn; /* Internal, do not use. */ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index ee0c952..fee6631 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -18,7 +18,6 @@ enum kmsg_dump_reason { KMSG_DUMP_OOPS, KMSG_DUMP_PANIC, - KMSG_DUMP_KEXEC, KMSG_DUMP_RESTART, KMSG_DUMP_HALT, KMSG_DUMP_POWEROFF, diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 3f46aed..807f1e5 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -88,8 +88,4 @@ #endif -#define NORET_TYPE /**/ -#define ATTRIB_NORET __attribute__((noreturn)) -#define NORET_AND noreturn, - #endif diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f944591..4d34356 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -32,13 +32,11 @@ enum mem_cgroup_page_stat_item { MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ }; -extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, - struct mem_cgroup *mem_cont, - int active, int file); +struct mem_cgroup_reclaim_cookie { + struct zone *zone; + int priority; + unsigned int generation; +}; #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* @@ -56,20 +54,21 @@ extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); /* for swap handling */ extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, gfp_t mask, struct mem_cgroup **ptr); + struct page *page, gfp_t mask, struct mem_cgroup **memcgp); extern void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *ptr); -extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); + struct mem_cgroup *memcg); +extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg); extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); -extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru); -extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru); -extern void mem_cgroup_rotate_reclaimable_page(struct page *page); -extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru); -extern void mem_cgroup_del_lru(struct page *page); -extern void mem_cgroup_move_lists(struct page *page, - enum lru_list from, enum lru_list to); + +struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); +struct lruvec *mem_cgroup_lru_add_list(struct zone *, struct page *, + enum lru_list); +void mem_cgroup_lru_del_list(struct page *, enum lru_list); +void mem_cgroup_lru_del(struct page *); +struct lruvec *mem_cgroup_lru_move_lists(struct zone *, struct page *, + enum lru_list, enum lru_list); /* For coalescing uncharge for reducing memcg' overhead*/ extern void mem_cgroup_uncharge_start(void); @@ -102,10 +101,15 @@ extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); extern int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask); + struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask); extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok); +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, + struct mem_cgroup *, + struct mem_cgroup_reclaim_cookie *); +void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); + /* * For memory reclaim. */ @@ -122,7 +126,10 @@ struct zone_reclaim_stat* mem_cgroup_get_reclaim_stat_from_page(struct page *page); extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); +extern void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage); +extern void mem_cgroup_reset_owner(struct page *page); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; #endif @@ -157,7 +164,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg); void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); #ifdef CONFIG_TRANSPARENT_HUGEPAGE -void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); +void mem_cgroup_split_huge_fixup(struct page *head); #endif #ifdef CONFIG_DEBUG_VM @@ -180,17 +187,17 @@ static inline int mem_cgroup_cache_charge(struct page *page, } static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr) + struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp) { return 0; } static inline void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *ptr) + struct mem_cgroup *memcg) { } -static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr) +static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) { } @@ -210,33 +217,33 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page) { } -static inline void mem_cgroup_add_lru_list(struct page *page, int lru) -{ -} - -static inline void mem_cgroup_del_lru_list(struct page *page, int lru) +static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, + struct mem_cgroup *memcg) { - return ; + return &zone->lruvec; } -static inline void mem_cgroup_rotate_reclaimable_page(struct page *page) +static inline struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, + struct page *page, + enum lru_list lru) { - return ; + return &zone->lruvec; } -static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru) +static inline void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) { - return ; } -static inline void mem_cgroup_del_lru(struct page *page) +static inline void mem_cgroup_lru_del(struct page *page) { - return ; } -static inline void -mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) +static inline struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, + struct page *page, + enum lru_list from, + enum lru_list to) { + return &zone->lruvec; } static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) @@ -269,7 +276,7 @@ static inline struct cgroup_subsys_state static inline int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **ptr, gfp_t gfp_mask) + struct mem_cgroup **memcgp, gfp_t gfp_mask) { return 0; } @@ -279,6 +286,19 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, { } +static inline struct mem_cgroup * +mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) +{ + return NULL; +} + +static inline void mem_cgroup_iter_break(struct mem_cgroup *root, + struct mem_cgroup *prev) +{ +} + static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg) { return 0; @@ -360,8 +380,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return 0; } -static inline void mem_cgroup_split_huge_fixup(struct page *head, - struct page *tail) +static inline void mem_cgroup_split_huge_fixup(struct page *head) { } @@ -369,6 +388,14 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { } +static inline void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage) +{ +} + +static inline void mem_cgroup_reset_owner(struct page *page) +{ +} #endif /* CONFIG_CGROUP_MEM_CONT */ #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index e39aeec..eaf8674 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -6,18 +6,31 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); +/* + * MIGRATE_ASYNC means never block + * MIGRATE_SYNC_LIGHT in the current implementation means to allow blocking + * on most operations but not ->writepage as the potential stall time + * is too significant + * MIGRATE_SYNC will block when migrating pages + */ +enum migrate_mode { + MIGRATE_ASYNC, + MIGRATE_SYNC_LIGHT, + MIGRATE_SYNC, +}; + #ifdef CONFIG_MIGRATION #define PAGE_MIGRATION 1 extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync); + enum migrate_mode mode); extern int migrate_huge_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync); + enum migrate_mode mode); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -36,10 +49,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync) { return -ENOSYS; } + enum migrate_mode mode) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync) { return -ENOSYS; } + enum migrate_mode mode) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8f7d247..227fd3e 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -22,26 +22,21 @@ static inline int page_is_file_cache(struct page *page) } static inline void -__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, - struct list_head *head) +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list lru) { - list_add(&page->lru, head); - __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); - mem_cgroup_add_lru_list(page, l); -} + struct lruvec *lruvec; -static inline void -add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) -{ - __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); + lruvec = mem_cgroup_lru_add_list(zone, page, lru); + list_add(&page->lru, &lruvec->lists[lru]); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, hpage_nr_pages(page)); } static inline void -del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) +del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list lru) { + mem_cgroup_lru_del_list(page, lru); list_del(&page->lru); - __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); - mem_cgroup_del_lru_list(page, l); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -hpage_nr_pages(page)); } /** @@ -59,24 +54,28 @@ static inline enum lru_list page_lru_base_type(struct page *page) return LRU_INACTIVE_ANON; } -static inline void -del_page_from_lru(struct zone *zone, struct page *page) +/** + * page_off_lru - which LRU list was page on? clearing its lru flags. + * @page: the page to test + * + * Returns the LRU list a page was on, as an index into the array of LRU + * lists; and clears its Unevictable or Active flags, ready for freeing. + */ +static inline enum lru_list page_off_lru(struct page *page) { - enum lru_list l; + enum lru_list lru; - list_del(&page->lru); if (PageUnevictable(page)) { __ClearPageUnevictable(page); - l = LRU_UNEVICTABLE; + lru = LRU_UNEVICTABLE; } else { - l = page_lru_base_type(page); + lru = page_lru_base_type(page); if (PageActive(page)) { __ClearPageActive(page); - l += LRU_ACTIVE; + lru += LRU_ACTIVE; } } - __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); - mem_cgroup_del_lru_list(page, l); + return lru; } /** @@ -97,7 +96,6 @@ static inline enum lru_list page_lru(struct page *page) if (PageActive(page)) lru += LRU_ACTIVE; } - return lru; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5b42f1b..3cc3062 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -151,12 +151,11 @@ struct page { #endif } /* - * If another subsystem starts using the double word pairing for atomic - * operations on struct page then it must change the #if to ensure - * proper alignment of the page struct. + * The struct page can be forced to be double word aligned so that atomic ops + * on double words work. The SLUB allocator can make use of such a feature. */ -#if defined(CONFIG_SLUB) && defined(CONFIG_CMPXCHG_LOCAL) - __attribute__((__aligned__(2*sizeof(unsigned long)))) +#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE + __aligned(2 * sizeof(unsigned long)) #endif ; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ca6ca92..650ba2f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -140,25 +140,29 @@ enum lru_list { NR_LRU_LISTS }; -#define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++) +#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) -#define for_each_evictable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++) +#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) -static inline int is_file_lru(enum lru_list l) +static inline int is_file_lru(enum lru_list lru) { - return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE); + return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } -static inline int is_active_lru(enum lru_list l) +static inline int is_active_lru(enum lru_list lru) { - return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE); + return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } -static inline int is_unevictable_lru(enum lru_list l) +static inline int is_unevictable_lru(enum lru_list lru) { - return (l == LRU_UNEVICTABLE); + return (lru == LRU_UNEVICTABLE); } +struct lruvec { + struct list_head lists[NR_LRU_LISTS]; +}; + /* Mask used at gathering information at once (see memcontrol.c) */ #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) @@ -173,6 +177,8 @@ static inline int is_unevictable_lru(enum lru_list l) #define ISOLATE_CLEAN ((__force isolate_mode_t)0x4) /* Isolate unmapped file */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8) +/* Isolate for asynchronous migration */ +#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x10) /* LRU Isolation modes. */ typedef unsigned __bitwise__ isolate_mode_t; @@ -364,10 +370,8 @@ struct zone { ZONE_PADDING(_pad1_) /* Fields commonly accessed by the page reclaim scanner */ - spinlock_t lru_lock; - struct zone_lru { - struct list_head list; - } lru[NR_LRU_LISTS]; + spinlock_t lru_lock; + struct lruvec lruvec; struct zone_reclaim_stat reclaim_stat; diff --git a/include/linux/oom.h b/include/linux/oom.h index 6f9d04a..552fba9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -43,7 +43,7 @@ enum oom_constraint { extern void compare_swap_oom_score_adj(int old_val, int new_val); extern int test_set_oom_score_adj(int new_val); -extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, +extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 961ecc7..a2d11771 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -10,8 +10,6 @@ enum { /* flags for mem_cgroup and file and I/O status */ PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ - /* No lock in page_cgroup */ - PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ __NR_PCG_FLAGS, }; @@ -31,7 +29,6 @@ enum { struct page_cgroup { unsigned long flags; struct mem_cgroup *mem_cgroup; - struct list_head lru; /* per cgroup LRU list */ }; void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); @@ -76,12 +73,6 @@ TESTPCGFLAG(Used, USED) CLEARPCGFLAG(Used, USED) SETPCGFLAG(Used, USED) -SETPCGFLAG(AcctLRU, ACCT_LRU) -CLEARPCGFLAG(AcctLRU, ACCT_LRU) -TESTPCGFLAG(AcctLRU, ACCT_LRU) -TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) - - SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) @@ -122,39 +113,6 @@ static inline void move_unlock_page_cgroup(struct page_cgroup *pc, local_irq_restore(*flags); } -#ifdef CONFIG_SPARSEMEM -#define PCG_ARRAYID_WIDTH SECTIONS_SHIFT -#else -#define PCG_ARRAYID_WIDTH NODES_SHIFT -#endif - -#if (PCG_ARRAYID_WIDTH > BITS_PER_LONG - NR_PCG_FLAGS) -#error Not enough space left in pc->flags to store page_cgroup array IDs -#endif - -/* pc->flags: ARRAY-ID | FLAGS */ - -#define PCG_ARRAYID_MASK ((1UL << PCG_ARRAYID_WIDTH) - 1) - -#define PCG_ARRAYID_OFFSET (BITS_PER_LONG - PCG_ARRAYID_WIDTH) -/* - * Zero the shift count for non-existent fields, to prevent compiler - * warnings and ensure references are optimized away. - */ -#define PCG_ARRAYID_SHIFT (PCG_ARRAYID_OFFSET * (PCG_ARRAYID_WIDTH != 0)) - -static inline void set_page_cgroup_array_id(struct page_cgroup *pc, - unsigned long id) -{ - pc->flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_SHIFT); - pc->flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_SHIFT; -} - -static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc) -{ - return (pc->flags >> PCG_ARRAYID_SHIFT) & PCG_ARRAYID_MASK; -} - #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; @@ -183,7 +141,7 @@ static inline void __init page_cgroup_init_flatmem(void) extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); -extern unsigned short lookup_swap_cgroup(swp_entry_t ent); +extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); #else @@ -195,7 +153,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) } static inline -unsigned short lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { return 0; } diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index ed17024..2aa12b8 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -21,8 +21,7 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); -void pagevec_strip(struct pagevec *pvec); +void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, @@ -59,7 +58,6 @@ static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page) return pagevec_space(pvec); } - static inline void pagevec_release(struct pagevec *pvec) { if (pagevec_count(pvec)) @@ -68,22 +66,22 @@ static inline void pagevec_release(struct pagevec *pvec) static inline void __pagevec_lru_add_anon(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON); + __pagevec_lru_add(pvec, LRU_INACTIVE_ANON); } static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_ACTIVE_ANON); + __pagevec_lru_add(pvec, LRU_ACTIVE_ANON); } static inline void __pagevec_lru_add_file(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_INACTIVE_FILE); + __pagevec_lru_add(pvec, LRU_INACTIVE_FILE); } static inline void __pagevec_lru_add_active_file(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_ACTIVE_FILE); + __pagevec_lru_add(pvec, LRU_ACTIVE_FILE); } static inline void pagevec_lru_add_file(struct pagevec *pvec) diff --git a/include/linux/prctl.h b/include/linux/prctl.h index a3baeb2..7ddc7f1 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -102,4 +102,16 @@ #define PR_MCE_KILL_GET 34 +/* + * Tune up process memory map specifics. + */ +#define PR_SET_MM 35 +# define PR_SET_MM_START_CODE 1 +# define PR_SET_MM_END_CODE 2 +# define PR_SET_MM_START_DATA 3 +# define PR_SET_MM_END_DATA 4 +# define PR_SET_MM_START_STACK 5 +# define PR_SET_MM_START_BRK 6 +# define PR_SET_MM_BRK 7 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 9d4539c..07e360b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -49,9 +49,6 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 -#define radix_tree_indirect_to_ptr(ptr) \ - radix_tree_indirect_to_ptr((void __force *)(ptr)) - static inline int radix_tree_is_indirect_ptr(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1afb995..1cdd62a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -158,7 +158,7 @@ static inline void page_dup_rmap(struct page *page) * Called from mm/vmscan.c to handle paging out */ int page_referenced(struct page *, int is_locked, - struct mem_cgroup *cnt, unsigned long *vm_flags); + struct mem_cgroup *memcg, unsigned long *vm_flags); int page_referenced_one(struct page *, struct vm_area_struct *, unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); @@ -236,7 +236,7 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *, #define anon_vma_link(vma) do {} while (0) static inline int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *cnt, + struct mem_cgroup *memcg, unsigned long *vm_flags) { *vm_flags = 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index 21cd030..4032ec1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2275,7 +2275,7 @@ extern void __cleanup_sighand(struct sighand_struct *); extern void exit_itimers(struct signal_struct *); extern void flush_itimer_signals(void); -extern NORET_TYPE void do_group_exit(int); +extern void do_group_exit(int); extern void daemonize(const char *, ...); extern int allow_signal(int); diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index edc4b3d..f64560e 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -266,9 +266,10 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - isolate_mode_t isolate_mode), + isolate_mode_t isolate_mode, + int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode), + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file), TP_STRUCT__entry( __field(int, order) @@ -279,6 +280,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __field(unsigned long, nr_lumpy_dirty) __field(unsigned long, nr_lumpy_failed) __field(isolate_mode_t, isolate_mode) + __field(int, file) ), TP_fast_assign( @@ -290,9 +292,10 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __entry->nr_lumpy_dirty = nr_lumpy_dirty; __entry->nr_lumpy_failed = nr_lumpy_failed; __entry->isolate_mode = isolate_mode; + __entry->file = file; ), - TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu", + TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu file=%d", __entry->isolate_mode, __entry->order, __entry->nr_requested, @@ -300,7 +303,8 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __entry->nr_taken, __entry->nr_lumpy_taken, __entry->nr_lumpy_dirty, - __entry->nr_lumpy_failed) + __entry->nr_lumpy_failed, + __entry->file) ); DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, @@ -312,9 +316,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - isolate_mode_t isolate_mode), + isolate_mode_t isolate_mode, + int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file) ); @@ -327,9 +332,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - isolate_mode_t isolate_mode), + isolate_mode_t isolate_mode, + int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file) ); diff --git a/init/Kconfig b/init/Kconfig index 018d206..6ac2236 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -783,6 +783,17 @@ config DEBUG_BLK_CGROUP endif # CGROUPS +config CHECKPOINT_RESTORE + bool "Checkpoint/restore support" if EXPERT + default n + help + Enables additional kernel features in a sake of checkpoint/restore. + In particular it adds auxiliary prctl codes to setup process text, + data and heap segment sizes, and a few additional /proc filesystem + entries. + + If unsure, say N here. + menuconfig NAMESPACES bool "Namespaces support" if EXPERT default !EXPERT diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e2..c447382 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -887,7 +887,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -NORET_TYPE void do_exit(long code) +void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code) EXPORT_SYMBOL_GPL(do_exit); -NORET_TYPE void complete_and_exit(struct completion *comp, long code) +void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); @@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -NORET_TYPE void +void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; diff --git a/kernel/kexec.c b/kernel/kexec.c index 090ee10..7b08867 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,7 +32,6 @@ #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/swap.h> -#include <linux/kmsg_dump.h> #include <linux/syscore_ops.h> #include <asm/page.h> @@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs) if (kexec_crash_image) { struct pt_regs fixed_regs; - kmsg_dump(KMSG_DUMP_KEXEC); - crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); @@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + unsigned long old_size; + struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size) } start = crashk_res.start; end = crashk_res.end; + old_size = (end == 0) ? 0 : end - start + 1; + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; + goto unlock; + } - if (new_size >= end - start + 1) { - ret = -EINVAL; - if (new_size == end - start + 1) - ret = 0; + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) { + ret = -ENOMEM; goto unlock; } @@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size) if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); + + ram_res->start = end; + ram_res->end = crashk_res.end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->name = "System RAM"; + crashk_res.end = end - 1; + + insert_resource(&iomem_resource, ram_res); crash_unmap_reserved_pages(); unlock: diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5d8464..95dd721 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; - int buf_size; + size_t buf_size; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) diff --git a/kernel/panic.c b/kernel/panic.c index 3458469..80aed44 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -49,6 +49,15 @@ static long no_blink(int state) long (*panic_blink)(int state); EXPORT_SYMBOL(panic_blink); +/* + * Stop ourself in panic -- architecture code may override this + */ +void __weak panic_smp_self_stop(void) +{ + while (1) + cpu_relax(); +} + /** * panic - halt the system * @fmt: The text string to print @@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink); * * This function never returns. */ -NORET_TYPE void panic(const char * fmt, ...) +void panic(const char *fmt, ...) { + static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; @@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...) * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... + * + * Only one CPU is allowed to execute the panic code from here. For + * multiple parallel invocations of panic, all other CPUs either + * stop themself or will wait until they are stopped by the 1st CPU + * with smp_send_stop(). */ - preempt_disable(); + if (!spin_trylock(&panic_lock)) + panic_smp_self_stop(); console_verbose(); bust_spinlocks(1); @@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...) va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); #ifdef CONFIG_DEBUG_BUGVERBOSE - dump_stack(); + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!oops_in_progress) + dump_stack(); #endif /* diff --git a/kernel/pid.c b/kernel/pid.c index fa5f722..ce8e00d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) } /* - * We might be racing with someone else trying to set pid_ns->last_pid. + * We might be racing with someone else trying to set pid_ns->last_pid + * at the pid allocation time (there's also a sysctl for this, but racing + * with this one is OK, see comment in kernel/pid_namespace.c about it). * We want the winner to have the "later" value, because if the * "earlier" value prevails, then a pid may get reused immediately. * diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc..a896839 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +static int pid_ns_ctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Writing directly to ns' last_pid field is OK, since this field + * is volatile in a living namespace anyway and a code writing to + * it should synchronize its usage with external means. + */ + + tmp.data = ¤t->nsproxy->pid_ns->last_pid; + return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table[] = { + { + .procname = "ns_last_pid", + .maxlen = sizeof(int), + .mode = 0666, /* permissions are checked in the handler */ + .proc_handler = pid_ns_ctl_handler, + }, + { } +}; + +static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + register_sysctl_paths(kern_path, pid_ns_ctl_table); return 0; } diff --git a/kernel/sys.c b/kernel/sys.c index ddf8155..4070153 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + unsigned long rlim = rlimit(RLIMIT_DATA); + unsigned long vm_req_flags; + unsigned long vm_bad_flags; + struct vm_area_struct *vma; + int error = 0; + struct mm_struct *mm = current->mm; + + if (arg4 | arg5) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (addr >= TASK_SIZE) + return -EINVAL; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + + if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { + /* It must be existing VMA */ + if (!vma || vma->vm_start > addr) + goto out; + } + + error = -EINVAL; + switch (opt) { + case PR_SET_MM_START_CODE: + case PR_SET_MM_END_CODE: + vm_req_flags = VM_READ | VM_EXEC; + vm_bad_flags = VM_WRITE | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_CODE) + mm->start_code = addr; + else + mm->end_code = addr; + break; + + case PR_SET_MM_START_DATA: + case PR_SET_MM_END_DATA: + vm_req_flags = VM_READ | VM_WRITE; + vm_bad_flags = VM_EXEC | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_DATA) + mm->start_data = addr; + else + mm->end_data = addr; + break; + + case PR_SET_MM_START_STACK: + +#ifdef CONFIG_STACK_GROWSUP + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; +#else + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; +#endif + if ((vma->vm_flags & vm_req_flags) != vm_req_flags) + goto out; + + mm->start_stack = addr; + break; + + case PR_SET_MM_START_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (mm->brk - addr) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->start_brk = addr; + break; + + case PR_SET_MM_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (addr - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->brk = addr; + break; + + default: + error = -EINVAL; + goto out; + } + + error = 0; + +out: + up_read(&mm->mmap_sem); + + return error; +} +#else /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + return -EINVAL; +} +#endif + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else error = PR_MCE_KILL_DEFAULT; break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c index 5a7a2ad..4531294 100644 --- a/lib/decompress_unlzo.c +++ b/lib/decompress_unlzo.c @@ -279,7 +279,7 @@ STATIC inline int INIT unlzo(u8 *input, int in_len, ret = 0; exit_2: if (!input) - free(in_buf); + free(in_buf_save); exit_1: if (!output) free(out_buf); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index d9df745..dc63d08 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -48,16 +48,14 @@ struct radix_tree_node { unsigned int height; /* Height from the bottom */ unsigned int count; - struct rcu_head rcu_head; + union { + struct radix_tree_node *parent; /* Used when ascending tree */ + struct rcu_head rcu_head; /* Used when freeing node */ + }; void __rcu *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; -struct radix_tree_path { - struct radix_tree_node *node; - int offset; -}; - #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) @@ -256,6 +254,7 @@ static inline unsigned long radix_tree_maxindex(unsigned int height) static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) { struct radix_tree_node *node; + struct radix_tree_node *slot; unsigned int height; int tag; @@ -274,18 +273,23 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) if (!(node = radix_tree_node_alloc(root))) return -ENOMEM; - /* Increase the height. */ - node->slots[0] = indirect_to_ptr(root->rnode); - /* Propagate the aggregated tag info into the new root */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { if (root_tag_get(root, tag)) tag_set(node, tag, 0); } + /* Increase the height. */ newheight = root->height+1; node->height = newheight; node->count = 1; + node->parent = NULL; + slot = root->rnode; + if (newheight > 1) { + slot = indirect_to_ptr(slot); + slot->parent = node; + } + node->slots[0] = slot; node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); root->height = newheight; @@ -331,6 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root, if (!(slot = radix_tree_node_alloc(root))) return -ENOMEM; slot->height = height; + slot->parent = node; if (node) { rcu_assign_pointer(node->slots[offset], slot); node->count++; @@ -504,47 +509,41 @@ EXPORT_SYMBOL(radix_tree_tag_set); void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { - /* - * The radix tree path needs to be one longer than the maximum path - * since the "list" is null terminated. - */ - struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; + struct radix_tree_node *node = NULL; struct radix_tree_node *slot = NULL; unsigned int height, shift; + int uninitialized_var(offset); height = root->height; if (index > radix_tree_maxindex(height)) goto out; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - pathp->node = NULL; + shift = height * RADIX_TREE_MAP_SHIFT; slot = indirect_to_ptr(root->rnode); - while (height > 0) { - int offset; - + while (shift) { if (slot == NULL) goto out; + shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - pathp[1].offset = offset; - pathp[1].node = slot; + node = slot; slot = slot->slots[offset]; - pathp++; - shift -= RADIX_TREE_MAP_SHIFT; - height--; } if (slot == NULL) goto out; - while (pathp->node) { - if (!tag_get(pathp->node, tag, pathp->offset)) + while (node) { + if (!tag_get(node, tag, offset)) goto out; - tag_clear(pathp->node, tag, pathp->offset); - if (any_tag_set(pathp->node, tag)) + tag_clear(node, tag, offset); + if (any_tag_set(node, tag)) goto out; - pathp--; + + index >>= RADIX_TREE_MAP_SHIFT; + offset = index & RADIX_TREE_MAP_MASK; + node = node->parent; } /* clear the root's tag bit */ @@ -646,8 +645,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, unsigned int iftag, unsigned int settag) { unsigned int height = root->height; - struct radix_tree_path path[height]; - struct radix_tree_path *pathp = path; + struct radix_tree_node *node = NULL; struct radix_tree_node *slot; unsigned int shift; unsigned long tagged = 0; @@ -671,14 +669,8 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, shift = (height - 1) * RADIX_TREE_MAP_SHIFT; slot = indirect_to_ptr(root->rnode); - /* - * we fill the path from (root->height - 2) to 0, leaving the index at - * (root->height - 1) as a terminator. Zero the node in the terminator - * so that we can use this to end walk loops back up the path. - */ - path[height - 1].node = NULL; - for (;;) { + unsigned long upindex; int offset; offset = (index >> shift) & RADIX_TREE_MAP_MASK; @@ -686,12 +678,10 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, goto next; if (!tag_get(slot, iftag, offset)) goto next; - if (height > 1) { + if (shift) { /* Go down one level */ - height--; shift -= RADIX_TREE_MAP_SHIFT; - path[height - 1].node = slot; - path[height - 1].offset = offset; + node = slot; slot = slot->slots[offset]; continue; } @@ -701,15 +691,27 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, tag_set(slot, settag, offset); /* walk back up the path tagging interior nodes */ - pathp = &path[0]; - while (pathp->node) { + upindex = index; + while (node) { + upindex >>= RADIX_TREE_MAP_SHIFT; + offset = upindex & RADIX_TREE_MAP_MASK; + /* stop if we find a node with the tag already set */ - if (tag_get(pathp->node, settag, pathp->offset)) + if (tag_get(node, settag, offset)) break; - tag_set(pathp->node, settag, pathp->offset); - pathp++; + tag_set(node, settag, offset); + node = node->parent; } + /* + * Small optimization: now clear that node pointer. + * Since all of this slot's ancestors now have the tag set + * from setting it above, we have no further need to walk + * back up the tree setting tags, until we update slot to + * point to another radix_tree_node. + */ + node = NULL; + next: /* Go to next item at level determined by 'shift' */ index = ((index >> shift) + 1) << shift; @@ -724,8 +726,7 @@ next: * last_index is guaranteed to be in the tree, what * we do below cannot wander astray. */ - slot = path[height - 1].node; - height++; + slot = slot->parent; shift += RADIX_TREE_MAP_SHIFT; } } @@ -1299,7 +1300,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) /* try to shrink tree height */ while (root->height > 0) { struct radix_tree_node *to_free = root->rnode; - void *newptr; + struct radix_tree_node *slot; BUG_ON(!radix_tree_is_indirect_ptr(to_free)); to_free = indirect_to_ptr(to_free); @@ -1320,10 +1321,12 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) * (to_free->slots[0]), it will be safe to dereference the new * one (root->rnode) as far as dependent read barriers go. */ - newptr = to_free->slots[0]; - if (root->height > 1) - newptr = ptr_to_indirect(newptr); - root->rnode = newptr; + slot = to_free->slots[0]; + if (root->height > 1) { + slot->parent = NULL; + slot = ptr_to_indirect(slot); + } + root->rnode = slot; root->height--; /* @@ -1363,16 +1366,12 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) */ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { - /* - * The radix tree path needs to be one longer than the maximum path - * since the "list" is null terminated. - */ - struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; + struct radix_tree_node *node = NULL; struct radix_tree_node *slot = NULL; struct radix_tree_node *to_free; unsigned int height, shift; int tag; - int offset; + int uninitialized_var(offset); height = root->height; if (index > radix_tree_maxindex(height)) @@ -1385,39 +1384,35 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) goto out; } slot = indirect_to_ptr(slot); - - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - pathp->node = NULL; + shift = height * RADIX_TREE_MAP_SHIFT; do { if (slot == NULL) goto out; - pathp++; + shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - pathp->offset = offset; - pathp->node = slot; + node = slot; slot = slot->slots[offset]; - shift -= RADIX_TREE_MAP_SHIFT; - height--; - } while (height > 0); + } while (shift); if (slot == NULL) goto out; /* - * Clear all tags associated with the just-deleted item + * Clear all tags associated with the item to be deleted. + * This way of doing it would be inefficient, but seldom is any set. */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tag_get(pathp->node, tag, pathp->offset)) + if (tag_get(node, tag, offset)) radix_tree_tag_clear(root, index, tag); } to_free = NULL; /* Now free the nodes we do not need anymore */ - while (pathp->node) { - pathp->node->slots[pathp->offset] = NULL; - pathp->node->count--; + while (node) { + node->slots[offset] = NULL; + node->count--; /* * Queue the node for deferred freeing after the * last reference to it disappears (set NULL, above). @@ -1425,17 +1420,20 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) if (to_free) radix_tree_node_free(to_free); - if (pathp->node->count) { - if (pathp->node == indirect_to_ptr(root->rnode)) + if (node->count) { + if (node == indirect_to_ptr(root->rnode)) radix_tree_shrink(root); goto out; } /* Node with zero slots in use so free it */ - to_free = pathp->node; - pathp--; + to_free = node; + index >>= RADIX_TREE_MAP_SHIFT; + offset = index & RADIX_TREE_MAP_MASK; + node = node->parent; } + root_tag_clear_all(root); root->height = 0; root->rnode = NULL; diff --git a/mm/compaction.c b/mm/compaction.c index e6670c34..71a58f6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -350,7 +350,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } if (!cc->sync) - mode |= ISOLATE_CLEAN; + mode |= ISOLATE_ASYNC_MIGRATE; /* Try isolate the page */ if (__isolate_lru_page(page, mode, 0) != 0) @@ -557,7 +557,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, - cc->sync); + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -671,6 +671,7 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, + .sync = true, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/filemap.c b/mm/filemap.c index c4ee2e9..97f49ed 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -393,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { int error; - struct mem_cgroup *memcg = NULL; VM_BUG_ON(!PageLocked(old)); VM_BUG_ON(!PageLocked(new)); VM_BUG_ON(new->mapping); - /* - * This is not page migration, but prepare_migration and - * end_migration does enough work for charge replacement. - * - * In the longer term we probably want a specialized function - * for moving the charge from old to new in a more efficient - * manner. - */ - error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); - if (error) - return error; - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (!error) { struct address_space *mapping = old->mapping; @@ -432,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); + /* mem_cgroup codes must not be called under tree_lock */ + mem_cgroup_replace_page_cache(old, new); radix_tree_preload_end(); if (freepage) freepage(old); page_cache_release(old); - mem_cgroup_end_migration(memcg, old, new, true); - } else { - mem_cgroup_end_migration(memcg, old, new, false); } return error; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36b3d98..b3ffc21 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -487,41 +487,68 @@ static struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; -#endif /* CONFIG_SYSFS */ -static int __init hugepage_init(void) +static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) { int err; -#ifdef CONFIG_SYSFS - static struct kobject *hugepage_kobj; -#endif - - err = -EINVAL; - if (!has_transparent_hugepage()) { - transparent_hugepage_flags = 0; - goto out; - } -#ifdef CONFIG_SYSFS - err = -ENOMEM; - hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); - if (unlikely(!hugepage_kobj)) { + *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!*hugepage_kobj)) { printk(KERN_ERR "hugepage: failed kobject create\n"); - goto out; + return -ENOMEM; } - err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { printk(KERN_ERR "hugepage: failed register hugeage group\n"); - goto out; + goto delete_obj; } - err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { printk(KERN_ERR "hugepage: failed register hugeage group\n"); - goto out; + goto remove_hp_group; } -#endif + + return 0; + +remove_hp_group: + sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); +delete_obj: + kobject_put(*hugepage_kobj); + return err; +} + +static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ + sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); + sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); + kobject_put(hugepage_kobj); +} +#else +static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) +{ + return 0; +} + +static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ +} +#endif /* CONFIG_SYSFS */ + +static int __init hugepage_init(void) +{ + int err; + struct kobject *hugepage_kobj; + + if (!has_transparent_hugepage()) { + transparent_hugepage_flags = 0; + return -EINVAL; + } + + err = hugepage_init_sysfs(&hugepage_kobj); + if (err) + return err; err = khugepaged_slab_init(); if (err) @@ -545,7 +572,9 @@ static int __init hugepage_init(void) set_recommended_min_free_kbytes(); + return 0; out: + hugepage_exit_sysfs(hugepage_kobj); return err; } module_init(hugepage_init) @@ -997,7 +1026,7 @@ out: } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, - pmd_t *pmd) + pmd_t *pmd, unsigned long addr) { int ret = 0; @@ -1013,6 +1042,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pgtable = get_pmd_huge_pte(tlb->mm); page = pmd_page(*pmd); pmd_clear(pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); page_remove_rmap(page); VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); @@ -1116,7 +1146,6 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_modify(entry, newprot); set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); - flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); ret = 1; } } else @@ -1199,16 +1228,16 @@ static int __split_huge_page_splitting(struct page *page, static void __split_huge_page_refcount(struct page *page) { int i; - unsigned long head_index = page->index; struct zone *zone = page_zone(page); - int zonestat; int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); compound_lock(page); + /* complete memcg works before add pages to LRU */ + mem_cgroup_split_huge_fixup(page); - for (i = 1; i < HPAGE_PMD_NR; i++) { + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { struct page *page_tail = page + i; /* tail_page->_mapcount cannot change */ @@ -1271,14 +1300,13 @@ static void __split_huge_page_refcount(struct page *page) BUG_ON(page_tail->mapping); page_tail->mapping = page->mapping; - page_tail->index = ++head_index; + page_tail->index = page->index + i; BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); - mem_cgroup_split_huge_fixup(page, page_tail); lru_add_page_tail(zone, page, page_tail); } @@ -1288,15 +1316,6 @@ static void __split_huge_page_refcount(struct page *page) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); - /* - * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, - * so adjust those appropriately if this page is on the LRU. - */ - if (PageLRU(page)) { - zonestat = NR_LRU_BASE + page_lru(page); - __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); - } - ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); @@ -28,6 +28,7 @@ #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/memcontrol.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> @@ -1571,6 +1572,16 @@ struct page *ksm_does_need_to_copy(struct page *page, new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { + /* + * The memcg-specific accounting when moving + * pages around the LRU lists relies on the + * page's owner (memcg) to be valid. Usually, + * pages are assigned to a new owner before + * being put on the LRU list, but since this + * is not the case here, the stale owner from + * a previous allocation cycle must be reset. + */ + mem_cgroup_reset_owner(new_page); copy_user_highpage(new_page, page, address, vma); SetPageDirty(new_page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d87aa35..602207b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -123,16 +123,22 @@ struct mem_cgroup_stat_cpu { unsigned long targets[MEM_CGROUP_NTARGETS]; }; +struct mem_cgroup_reclaim_iter { + /* css_id of the last scanned hierarchy member */ + int position; + /* scan generation, increased every round-trip */ + unsigned int generation; +}; + /* * per-zone information in memory controller. */ struct mem_cgroup_per_zone { - /* - * spin_lock to protect the per cgroup LRU - */ - struct list_head lists[NR_LRU_LISTS]; + struct lruvec lruvec; unsigned long count[NR_LRU_LISTS]; + struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + struct zone_reclaim_stat reclaim_stat; struct rb_node tree_node; /* RB tree node */ unsigned long long usage_in_excess;/* Set to the value by which */ @@ -233,11 +239,6 @@ struct mem_cgroup { * per zone LRU lists. */ struct mem_cgroup_lru_info info; - /* - * While reclaiming in a hierarchy, we cache the last child we - * reclaimed from. - */ - int last_scanned_child; int last_scanned_node; #if MAX_NUMNODES > 1 nodemask_t scan_nodes; @@ -366,8 +367,6 @@ enum charge_type { #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) -#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 -#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); @@ -566,7 +565,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; - for_each_node_state(node, N_POSSIBLE) { + for_each_node(node) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = mem_cgroup_zoneinfo(memcg, node, zone); mctz = soft_limit_tree_node_zone(node, zone); @@ -656,16 +655,6 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } -void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) -{ - this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); -} - -void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) -{ - this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); -} - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@ -749,37 +738,32 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, return total; } -static bool __memcg_event_check(struct mem_cgroup *memcg, int target) +static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target) { unsigned long val, next; val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ - return ((long)next - (long)val < 0); -} - -static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) -{ - unsigned long val, next; - - val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); - - switch (target) { - case MEM_CGROUP_TARGET_THRESH: - next = val + THRESHOLDS_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_NUMAINFO: - next = val + NUMAINFO_EVENTS_TARGET; - break; - default: - return; + if ((long)next - (long)val < 0) { + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_NUMAINFO: + next = val + NUMAINFO_EVENTS_TARGET; + break; + default: + break; + } + __this_cpu_write(memcg->stat->targets[target], next); + return true; } - - __this_cpu_write(memcg->stat->targets[target], next); + return false; } /* @@ -790,25 +774,27 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { + if (unlikely(mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit, do_numainfo; + + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); +#if MAX_NUMNODES > 1 + do_numainfo = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_NUMAINFO); +#endif + preempt_enable(); + mem_cgroup_threshold(memcg); - __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); - if (unlikely(__memcg_event_check(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT))) { + if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); - __mem_cgroup_target_update(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT); - } #if MAX_NUMNODES > 1 - if (unlikely(__memcg_event_check(memcg, - MEM_CGROUP_TARGET_NUMAINFO))) { + if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); - __mem_cgroup_target_update(memcg, - MEM_CGROUP_TARGET_NUMAINFO); - } #endif - } - preempt_enable(); + } else + preempt_enable(); } struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) @@ -853,83 +839,116 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -/* The caller has to guarantee "mem" exists before calling this */ -static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) +/** + * mem_cgroup_iter - iterate over memory cgroup hierarchy + * @root: hierarchy root + * @prev: previously returned memcg, NULL on first invocation + * @reclaim: cookie for shared reclaim walks, NULL for full walks + * + * Returns references to children of the hierarchy below @root, or + * @root itself, or %NULL after a full round-trip. + * + * Caller must pass the return value in @prev on subsequent + * invocations for reference counting, or use mem_cgroup_iter_break() + * to cancel a hierarchy walk before the round-trip is complete. + * + * Reclaimers can specify a zone and a priority level in @reclaim to + * divide up the memcgs in the hierarchy among all concurrent + * reclaimers operating on the same zone and priority. + */ +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) { - struct cgroup_subsys_state *css; - int found; + struct mem_cgroup *memcg = NULL; + int id = 0; - if (!memcg) /* ROOT cgroup has the smallest ID */ - return root_mem_cgroup; /*css_put/get against root is ignored*/ - if (!memcg->use_hierarchy) { - if (css_tryget(&memcg->css)) - return memcg; + if (mem_cgroup_disabled()) return NULL; - } - rcu_read_lock(); - /* - * searching a memory cgroup which has the smallest ID under given - * ROOT cgroup. (ID >= 1) - */ - css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); - if (css && css_tryget(css)) - memcg = container_of(css, struct mem_cgroup, css); - else - memcg = NULL; - rcu_read_unlock(); - return memcg; -} -static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, - struct mem_cgroup *root, - bool cond) -{ - int nextid = css_id(&iter->css) + 1; - int found; - int hierarchy_used; - struct cgroup_subsys_state *css; + if (!root) + root = root_mem_cgroup; - hierarchy_used = iter->use_hierarchy; + if (prev && !reclaim) + id = css_id(&prev->css); - css_put(&iter->css); - /* If no ROOT, walk all, ignore hierarchy */ - if (!cond || (root && !hierarchy_used)) - return NULL; + if (prev && prev != root) + css_put(&prev->css); - if (!root) - root = root_mem_cgroup; + if (!root->use_hierarchy && root != root_mem_cgroup) { + if (prev) + return NULL; + return root; + } - do { - iter = NULL; - rcu_read_lock(); + while (!memcg) { + struct mem_cgroup_reclaim_iter *uninitialized_var(iter); + struct cgroup_subsys_state *css; + + if (reclaim) { + int nid = zone_to_nid(reclaim->zone); + int zid = zone_idx(reclaim->zone); + struct mem_cgroup_per_zone *mz; - css = css_get_next(&mem_cgroup_subsys, nextid, - &root->css, &found); - if (css && css_tryget(css)) - iter = container_of(css, struct mem_cgroup, css); + mz = mem_cgroup_zoneinfo(root, nid, zid); + iter = &mz->reclaim_iter[reclaim->priority]; + if (prev && reclaim->generation != iter->generation) + return NULL; + id = iter->position; + } + + rcu_read_lock(); + css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); + if (css) { + if (css == &root->css || css_tryget(css)) + memcg = container_of(css, + struct mem_cgroup, css); + } else + id = 0; rcu_read_unlock(); - /* If css is NULL, no more cgroups will be found */ - nextid = found + 1; - } while (css && !iter); - return iter; + if (reclaim) { + iter->position = id; + if (!css) + iter->generation++; + else if (!prev && memcg) + reclaim->generation = iter->generation; + } + + if (prev && !css) + return NULL; + } + return memcg; } -/* - * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please - * be careful that "break" loop is not allowed. We have reference count. - * Instead of that modify "cond" to be false and "continue" to exit the loop. - */ -#define for_each_mem_cgroup_tree_cond(iter, root, cond) \ - for (iter = mem_cgroup_start_loop(root);\ - iter != NULL;\ - iter = mem_cgroup_get_next(iter, root, cond)) -#define for_each_mem_cgroup_tree(iter, root) \ - for_each_mem_cgroup_tree_cond(iter, root, true) +/** + * mem_cgroup_iter_break - abort a hierarchy walk prematurely + * @root: hierarchy root + * @prev: last visited hierarchy member as returned by mem_cgroup_iter() + */ +void mem_cgroup_iter_break(struct mem_cgroup *root, + struct mem_cgroup *prev) +{ + if (!root) + root = root_mem_cgroup; + if (prev && prev != root) + css_put(&prev->css); +} -#define for_each_mem_cgroup_all(iter) \ - for_each_mem_cgroup_tree_cond(iter, NULL, true) +/* + * Iteration constructs for visiting all cgroups (under a tree). If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting. + */ +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, NULL)) +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, NULL)) static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { @@ -949,11 +968,11 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) goto out; switch (idx) { - case PGMAJFAULT: - mem_cgroup_pgmajfault(memcg, 1); - break; case PGFAULT: - mem_cgroup_pgfault(memcg, 1); + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); + break; + case PGMAJFAULT: + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); break; default: BUG(); @@ -963,6 +982,27 @@ out: } EXPORT_SYMBOL(mem_cgroup_count_vm_event); +/** + * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg + * @zone: zone of the wanted lruvec + * @mem: memcg of the wanted lruvec + * + * Returns the lru list vector holding pages for the given @zone and + * @mem. This can be the global zone lruvec, if the memory controller + * is disabled. + */ +struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_zone *mz; + + if (mem_cgroup_disabled()) + return &zone->lruvec; + + mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + return &mz->lruvec; +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -977,180 +1017,91 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event); * When moving account, the page is not on LRU. It's isolated. */ -void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) -{ - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - - if (mem_cgroup_disabled()) - return; - pc = lookup_page_cgroup(page); - /* can happen while we handle swapcache. */ - if (!TestClearPageCgroupAcctLRU(pc)) - return; - VM_BUG_ON(!pc->mem_cgroup); - /* - * We don't check PCG_USED bit. It's cleared when the "page" is finally - * removed from global LRU. - */ - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - /* huge page split is done under lru_lock. so, we have no races. */ - MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - VM_BUG_ON(list_empty(&pc->lru)); - list_del_init(&pc->lru); -} - -void mem_cgroup_del_lru(struct page *page) -{ - mem_cgroup_del_lru_list(page, page_lru(page)); -} - -/* - * Writeback is about to end against a page which has been marked for immediate - * reclaim. If it still appears to be reclaimable, move it to the tail of the - * inactive list. +/** + * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec + * @zone: zone of the page + * @page: the page + * @lru: current lru + * + * This function accounts for @page being added to @lru, and returns + * the lruvec for the given @zone and the memcg @page is charged to. + * + * The callsite is then responsible for physically linking the page to + * the returned lruvec->lists[@lru]. */ -void mem_cgroup_rotate_reclaimable_page(struct page *page) +struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, + enum lru_list lru) { struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; struct page_cgroup *pc; - enum lru_list lru = page_lru(page); if (mem_cgroup_disabled()) - return; + return &zone->lruvec; pc = lookup_page_cgroup(page); - /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move_tail(&pc->lru, &mz->lists[lru]); + memcg = pc->mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); + /* compound_order() is stabilized through lru_lock */ + MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); + return &mz->lruvec; } -void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) +/** + * mem_cgroup_lru_del_list - account for removing an lru page + * @page: the page + * @lru: target lru + * + * This function accounts for @page being removed from @lru. + * + * The callsite is then responsible for physically unlinking + * @page->lru. + */ +void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) { struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; struct page_cgroup *pc; if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); - /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move(&pc->lru, &mz->lists[lru]); -} - -void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) -{ - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - - if (mem_cgroup_disabled()) - return; - pc = lookup_page_cgroup(page); - VM_BUG_ON(PageCgroupAcctLRU(pc)); - /* - * putback: charge: - * SetPageLRU SetPageCgroupUsed - * smp_mb smp_mb - * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU - * - * Ensure that one of the two sides adds the page to the memcg - * LRU during a race. - */ - smp_mb(); - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); + memcg = pc->mem_cgroup; + VM_BUG_ON(!memcg); + mz = page_cgroup_zoneinfo(memcg, page); /* huge page split is done under lru_lock. so, we have no races. */ - MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); - SetPageCgroupAcctLRU(pc); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - list_add(&pc->lru, &mz->lists[lru]); -} - -/* - * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed - * while it's linked to lru because the page may be reused after it's fully - * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. - * It's done under lock_page and expected that zone->lru_lock isnever held. - */ -static void mem_cgroup_lru_del_before_commit(struct page *page) -{ - unsigned long flags; - struct zone *zone = page_zone(page); - struct page_cgroup *pc = lookup_page_cgroup(page); - - /* - * Doing this check without taking ->lru_lock seems wrong but this - * is safe. Because if page_cgroup's USED bit is unset, the page - * will not be added to any memcg's LRU. If page_cgroup's USED bit is - * set, the commit after this will fail, anyway. - * This all charge/uncharge is done under some mutual execustion. - * So, we don't need to taking care of changes in USED bit. - */ - if (likely(!PageLRU(page))) - return; - - spin_lock_irqsave(&zone->lru_lock, flags); - /* - * Forget old LRU when this page_cgroup is *not* used. This Used bit - * is guarded by lock_page() because the page is SwapCache. - */ - if (!PageCgroupUsed(pc)) - mem_cgroup_del_lru_list(page, page_lru(page)); - spin_unlock_irqrestore(&zone->lru_lock, flags); + VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); } -static void mem_cgroup_lru_add_after_commit(struct page *page) +void mem_cgroup_lru_del(struct page *page) { - unsigned long flags; - struct zone *zone = page_zone(page); - struct page_cgroup *pc = lookup_page_cgroup(page); - /* - * putback: charge: - * SetPageLRU SetPageCgroupUsed - * smp_mb smp_mb - * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU - * - * Ensure that one of the two sides adds the page to the memcg - * LRU during a race. - */ - smp_mb(); - /* taking care of that the page is added to LRU while we commit it */ - if (likely(!PageLRU(page))) - return; - spin_lock_irqsave(&zone->lru_lock, flags); - /* link when the page is linked to LRU but page_cgroup isn't */ - if (PageLRU(page) && !PageCgroupAcctLRU(pc)) - mem_cgroup_add_lru_list(page, page_lru(page)); - spin_unlock_irqrestore(&zone->lru_lock, flags); + mem_cgroup_lru_del_list(page, page_lru(page)); } - -void mem_cgroup_move_lists(struct page *page, - enum lru_list from, enum lru_list to) +/** + * mem_cgroup_lru_move_lists - account for moving a page between lrus + * @zone: zone of the page + * @page: the page + * @from: current lru + * @to: target lru + * + * This function accounts for @page being moved between the lrus @from + * and @to, and returns the lruvec for the given @zone and the memcg + * @page is charged to. + * + * The callsite is then responsible for physically relinking + * @page->lru to the returned lruvec->lists[@to]. + */ +struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, + struct page *page, + enum lru_list from, + enum lru_list to) { - if (mem_cgroup_disabled()) - return; - mem_cgroup_del_lru_list(page, from); - mem_cgroup_add_lru_list(page, to); + /* XXX: Optimize this, especially for @from == @to */ + mem_cgroup_lru_del_list(page, from); + return mem_cgroup_lru_add_list(zone, page, to); } /* @@ -1175,10 +1126,21 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) struct task_struct *p; p = find_lock_task_mm(task); - if (!p) - return 0; - curr = try_get_mem_cgroup_from_mm(p->mm); - task_unlock(p); + if (p) { + curr = try_get_mem_cgroup_from_mm(p->mm); + task_unlock(p); + } else { + /* + * All threads may have already detached their mm's, but the oom + * killer still needs to detect if they have already been oom + * killed to prevent needlessly killing additional tasks. + */ + task_lock(task); + curr = mem_cgroup_from_task(task); + if (curr) + css_get(&curr->css); + task_unlock(task); + } if (!curr) return 0; /* @@ -1258,68 +1220,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return &mz->reclaim_stat; } -unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, - struct mem_cgroup *mem_cont, - int active, int file) -{ - unsigned long nr_taken = 0; - struct page *page; - unsigned long scan; - LIST_HEAD(pc_list); - struct list_head *src; - struct page_cgroup *pc, *tmp; - int nid = zone_to_nid(z); - int zid = zone_idx(z); - struct mem_cgroup_per_zone *mz; - int lru = LRU_FILE * file + active; - int ret; - - BUG_ON(!mem_cont); - mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); - src = &mz->lists[lru]; - - scan = 0; - list_for_each_entry_safe_reverse(pc, tmp, src, lru) { - if (scan >= nr_to_scan) - break; - - if (unlikely(!PageCgroupUsed(pc))) - continue; - - page = lookup_cgroup_page(pc); - - if (unlikely(!PageLRU(page))) - continue; - - scan++; - ret = __isolate_lru_page(page, mode, file); - switch (ret) { - case 0: - list_move(&page->lru, dst); - mem_cgroup_del_lru(page); - nr_taken += hpage_nr_pages(page); - break; - case -EBUSY: - /* we don't affect global LRU but rotate in our LRU */ - mem_cgroup_rotate_lru_list(page, page_lru(page)); - break; - default: - break; - } - } - - *scanned = scan; - - trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, - 0, 0, 0, mode); - - return nr_taken; -} - #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -1536,41 +1436,40 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return min(limit, memsw); } -/* - * Visit the first child (need not be the first child as per the ordering - * of the cgroup list, since we track last_scanned_child) of @mem and use - * that to reclaim free pages from. - */ -static struct mem_cgroup * -mem_cgroup_select_victim(struct mem_cgroup *root_memcg) +static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned long flags) { - struct mem_cgroup *ret = NULL; - struct cgroup_subsys_state *css; - int nextid, found; - - if (!root_memcg->use_hierarchy) { - css_get(&root_memcg->css); - ret = root_memcg; - } + unsigned long total = 0; + bool noswap = false; + int loop; - while (!ret) { - rcu_read_lock(); - nextid = root_memcg->last_scanned_child + 1; - css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, - &found); - if (css && css_tryget(css)) - ret = container_of(css, struct mem_cgroup, css); + if (flags & MEM_CGROUP_RECLAIM_NOSWAP) + noswap = true; + if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) + noswap = true; - rcu_read_unlock(); - /* Updates scanning parameter */ - if (!css) { - /* this means start scan from ID:1 */ - root_memcg->last_scanned_child = 0; - } else - root_memcg->last_scanned_child = found; + for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { + if (loop) + drain_all_stock_async(memcg); + total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); + /* + * Allow limit shrinkers, which are triggered directly + * by userspace, to catch signals and stop reclaim + * after minimal progress, regardless of the margin. + */ + if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) + break; + if (mem_cgroup_margin(memcg)) + break; + /* + * If nothing was reclaimed after two attempts, there + * may be no reclaimable pages in this hierarchy. + */ + if (loop && !total) + break; } - - return ret; + return total; } /** @@ -1710,61 +1609,35 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) } #endif -/* - * Scan the hierarchy if needed to reclaim memory. We remember the last child - * we reclaimed from, so that we don't end up penalizing one child extensively - * based on its position in the children list. - * - * root_memcg is the original ancestor that we've been reclaim from. - * - * We give up and return to the caller when we visit root_memcg twice. - * (other groups can be removed while we're walking....) - * - * If shrink==true, for avoiding to free too much, this returns immedieately. - */ -static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, - struct zone *zone, - gfp_t gfp_mask, - unsigned long reclaim_options, - unsigned long *total_scanned) -{ - struct mem_cgroup *victim; - int ret, total = 0; +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, + struct zone *zone, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + struct mem_cgroup *victim = NULL; + int total = 0; int loop = 0; - bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; - bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; - bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; unsigned long excess; unsigned long nr_scanned; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = 0, + }; excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; - /* If memsw_is_minimum==1, swap-out is of-no-use. */ - if (!check_soft && !shrink && root_memcg->memsw_is_minimum) - noswap = true; - while (1) { - victim = mem_cgroup_select_victim(root_memcg); - if (victim == root_memcg) { + victim = mem_cgroup_iter(root_memcg, victim, &reclaim); + if (!victim) { loop++; - /* - * We are not draining per cpu cached charges during - * soft limit reclaim because global reclaim doesn't - * care about charges. It tries to free some memory and - * charges will not give any. - */ - if (!check_soft && loop >= 1) - drain_all_stock_async(root_memcg); if (loop >= 2) { /* * If we have not been able to reclaim * anything, it might because there are * no reclaimable pages under this hierarchy */ - if (!check_soft || !total) { - css_put(&victim->css); + if (!total) break; - } /* * We want to do more targeted reclaim. * excess >> 2 is not to excessive so as to @@ -1772,40 +1645,20 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, * coming back to reclaim from this cgroup */ if (total >= (excess >> 2) || - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { - css_put(&victim->css); + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) break; - } } - } - if (!mem_cgroup_reclaimable(victim, noswap)) { - /* this cgroup's local usage == 0 */ - css_put(&victim->css); continue; } - /* we use swappiness of local cgroup */ - if (check_soft) { - ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &nr_scanned); - *total_scanned += nr_scanned; - } else - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap); - css_put(&victim->css); - /* - * At shrinking usage, we can't check we should stop here or - * reclaim more. It's depends on callers. last_scanned_child - * will work enough for keeping fairness under tree. - */ - if (shrink) - return ret; - total += ret; - if (check_soft) { - if (!res_counter_soft_limit_excess(&root_memcg->res)) - return total; - } else if (mem_cgroup_margin(root_memcg)) - return total; + if (!mem_cgroup_reclaimable(victim, false)) + continue; + total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, + zone, &nr_scanned); + *total_scanned += nr_scanned; + if (!res_counter_soft_limit_excess(&root_memcg->res)) + break; } + mem_cgroup_iter_break(root_memcg, victim); return total; } @@ -1817,16 +1670,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) { struct mem_cgroup *iter, *failed = NULL; - bool cond = true; - for_each_mem_cgroup_tree_cond(iter, memcg, cond) { + for_each_mem_cgroup_tree(iter, memcg) { if (iter->oom_lock) { /* * this subtree of our hierarchy is already locked * so we cannot give a lock. */ failed = iter; - cond = false; + mem_cgroup_iter_break(memcg, iter); + break; } else iter->oom_lock = true; } @@ -1838,11 +1691,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) * OK, we failed to lock the whole subtree so we have to clean up * what we set up to the failing subtree */ - cond = true; - for_each_mem_cgroup_tree_cond(iter, memcg, cond) { + for_each_mem_cgroup_tree(iter, memcg) { if (iter == failed) { - cond = false; - continue; + mem_cgroup_iter_break(memcg, iter); + break; } iter->oom_lock = false; } @@ -2007,7 +1859,7 @@ void mem_cgroup_update_page_stat(struct page *page, bool need_unlock = false; unsigned long uninitialized_var(flags); - if (unlikely(!pc)) + if (mem_cgroup_disabled()) return; rcu_read_lock(); @@ -2238,7 +2090,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, struct mem_cgroup *iter; if ((action == CPU_ONLINE)) { - for_each_mem_cgroup_all(iter) + for_each_mem_cgroup(iter) synchronize_mem_cgroup_on_move(iter, cpu); return NOTIFY_OK; } @@ -2246,7 +2098,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) return NOTIFY_OK; - for_each_mem_cgroup_all(iter) + for_each_mem_cgroup(iter) mem_cgroup_drain_pcp_counter(iter, cpu); stock = &per_cpu(memcg_stock, cpu); @@ -2300,8 +2152,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags, NULL); + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; /* @@ -2334,8 +2185,25 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } /* - * Unlike exported interface, "oom" parameter is added. if oom==true, - * oom-killer can be invoked. + * __mem_cgroup_try_charge() does + * 1. detect memcg to be charged against from passed *mm and *ptr, + * 2. update res_counter + * 3. call memory reclaim if necessary. + * + * In some special case, if the task is fatal, fatal_signal_pending() or + * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup + * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon + * as possible without any hazards. 2: all pages should have a valid + * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg + * pointer, that is treated as a charge to root_mem_cgroup. + * + * So __mem_cgroup_try_charge() will return + * 0 ... on success, filling *ptr with a valid memcg pointer. + * -ENOMEM ... charge failure because of resource limits. + * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. + * + * Unlike the exported interface, an "oom" parameter is added. if oom==true, + * the oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, @@ -2364,7 +2232,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * set, if so charge the init_mm (happens for pagecache usage). */ if (!*ptr && !mm) - goto bypass; + *ptr = root_mem_cgroup; again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; @@ -2390,7 +2258,9 @@ again: * task-struct. So, mm->owner can be NULL. */ memcg = mem_cgroup_from_task(p); - if (!memcg || mem_cgroup_is_root(memcg)) { + if (!memcg) + memcg = root_mem_cgroup; + if (mem_cgroup_is_root(memcg)) { rcu_read_unlock(); goto done; } @@ -2465,8 +2335,8 @@ nomem: *ptr = NULL; return -ENOMEM; bypass: - *ptr = NULL; - return 0; + *ptr = root_mem_cgroup; + return -EINTR; } /* @@ -2522,7 +2392,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); - id = lookup_swap_cgroup(ent); + id = lookup_swap_cgroup_id(ent); rcu_read_lock(); memcg = mem_cgroup_lookup(id); if (memcg && !css_tryget(&memcg->css)) @@ -2574,6 +2444,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); + WARN_ON_ONCE(PageLRU(page)); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. @@ -2585,44 +2456,29 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ - (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) + (1 << PCG_MIGRATION)) /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock, 'splitting on pmd' and compund_lock. + * zone->lru_lock, 'splitting on pmd' and compound_lock. + * charge/uncharge will be never happen and move_account() is done under + * compound_lock(), so we don't have to take care of races. */ -void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) +void mem_cgroup_split_huge_fixup(struct page *head) { struct page_cgroup *head_pc = lookup_page_cgroup(head); - struct page_cgroup *tail_pc = lookup_page_cgroup(tail); - unsigned long flags; + struct page_cgroup *pc; + int i; if (mem_cgroup_disabled()) return; - /* - * We have no races with charge/uncharge but will have races with - * page state accounting. - */ - move_lock_page_cgroup(head_pc, &flags); - - tail_pc->mem_cgroup = head_pc->mem_cgroup; - smp_wmb(); /* see __commit_charge() */ - if (PageCgroupAcctLRU(head_pc)) { - enum lru_list lru; - struct mem_cgroup_per_zone *mz; - - /* - * LRU flags cannot be copied because we need to add tail - *.page to LRU by generic call and our hook will be called. - * We hold lru_lock, then, reduce counter directly. - */ - lru = page_lru(head); - mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); - MEM_CGROUP_ZSTAT(mz, lru) -= 1; + for (i = 1; i < HPAGE_PMD_NR; i++) { + pc = head_pc + i; + pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb();/* see __commit_charge() */ + pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } - tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; - move_unlock_page_cgroup(head_pc, &flags); } -#endif +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /** * mem_cgroup_move_account - move account of the page @@ -2737,7 +2593,7 @@ static int mem_cgroup_move_parent(struct page *page, parent = mem_cgroup_from_cont(pcg); ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); - if (ret || !parent) + if (ret) goto put_back; if (nr_pages > 1) @@ -2783,12 +2639,9 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, } pc = lookup_page_cgroup(page); - BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); - if (ret || !memcg) + if (ret == -ENOMEM) return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); return 0; } @@ -2798,19 +2651,11 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - /* - * If already mapped, we don't have to account. - * If page cache, page->mapping has address_space. - * But page->mapping may have out-of-use anon_vma pointer, - * detecit it by PageAnon() check. newly-mapped-anon's page->mapping - * is NULL. - */ - if (page_mapped(page) || (page->mapping && !PageAnon(page))) - return 0; - if (unlikely(!mm)) - mm = &init_mm; + VM_BUG_ON(page_mapped(page)); + VM_BUG_ON(page->mapping && !PageAnon(page)); + VM_BUG_ON(!mm); return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_MAPPED); } static void @@ -2822,14 +2667,27 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { struct page_cgroup *pc = lookup_page_cgroup(page); + struct zone *zone = page_zone(page); + unsigned long flags; + bool removed = false; + /* * In some case, SwapCache, FUSE(splice_buf->radixtree), the page * is already on LRU. It means the page may on some other page_cgroup's * LRU. Take care of it. */ - mem_cgroup_lru_del_before_commit(page); + spin_lock_irqsave(&zone->lru_lock, flags); + if (PageLRU(page)) { + del_page_from_lru_list(zone, page, page_lru(page)); + ClearPageLRU(page); + removed = true; + } __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); - mem_cgroup_lru_add_after_commit(page); + if (removed) { + add_page_to_lru_list(zone, page, page_lru(page)); + SetPageLRU(page); + } + spin_unlock_irqrestore(&zone->lru_lock, flags); return; } @@ -2837,6 +2695,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; int ret; if (mem_cgroup_disabled()) @@ -2846,31 +2705,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (unlikely(!mm)) mm = &init_mm; + if (!page_is_file_cache(page)) + type = MEM_CGROUP_CHARGE_TYPE_SHMEM; - if (page_is_file_cache(page)) { - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); - if (ret || !memcg) - return ret; - - /* - * FUSE reuses pages without going through the final - * put that would remove them from the LRU list, make - * sure that they get relinked properly. - */ - __mem_cgroup_commit_charge_lrucare(page, memcg, - MEM_CGROUP_CHARGE_TYPE_CACHE); - return ret; - } - /* shmem */ - if (PageSwapCache(page)) { + if (!PageSwapCache(page)) + ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); + else { /* page is swapcache/shmem */ ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); if (!ret) - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_SHMEM); - } else - ret = mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM); - + __mem_cgroup_commit_charge_swapin(page, memcg, type); + } return ret; } @@ -2882,12 +2726,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, */ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t mask, struct mem_cgroup **ptr) + gfp_t mask, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg; int ret; - *ptr = NULL; + *memcgp = NULL; if (mem_cgroup_disabled()) return 0; @@ -2905,27 +2749,32 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, memcg = try_get_mem_cgroup_from_page(page); if (!memcg) goto charge_cur_mm; - *ptr = memcg; - ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); + *memcgp = memcg; + ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); css_put(&memcg->css); + if (ret == -EINTR) + ret = 0; return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); + ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); + if (ret == -EINTR) + ret = 0; + return ret; } static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, +__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { if (mem_cgroup_disabled()) return; - if (!ptr) + if (!memcg) return; - cgroup_exclude_rmdir(&ptr->css); + cgroup_exclude_rmdir(&memcg->css); - __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); + __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. @@ -2935,21 +2784,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + struct mem_cgroup *swap_memcg; unsigned short id; - struct mem_cgroup *memcg; id = swap_cgroup_record(ent, 0); rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { + swap_memcg = mem_cgroup_lookup(id); + if (swap_memcg) { /* * This recorded memcg can be obsolete one. So, avoid * calling css_tryget */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_put(memcg); + if (!mem_cgroup_is_root(swap_memcg)) + res_counter_uncharge(&swap_memcg->memsw, + PAGE_SIZE); + mem_cgroup_swap_statistics(swap_memcg, false); + mem_cgroup_put(swap_memcg); } rcu_read_unlock(); } @@ -2958,13 +2808,14 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, * So, rmdir()->pre_destroy() can be called while we do this charge. * In that case, we need to call pre_destroy() again. check it here. */ - cgroup_release_and_wakeup_rmdir(&ptr->css); + cgroup_release_and_wakeup_rmdir(&memcg->css); } -void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +void mem_cgroup_commit_charge_swapin(struct page *page, + struct mem_cgroup *memcg) { - __mem_cgroup_commit_charge_swapin(page, ptr, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + __mem_cgroup_commit_charge_swapin(page, memcg, + MEM_CGROUP_CHARGE_TYPE_MAPPED); } void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) @@ -3054,7 +2905,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) * Check if our page_cgroup is valid */ pc = lookup_page_cgroup(page); - if (unlikely(!pc || !PageCgroupUsed(pc))) + if (unlikely(!PageCgroupUsed(pc))) return NULL; lock_page_cgroup(pc); @@ -3117,8 +2968,7 @@ void mem_cgroup_uncharge_page(struct page *page) /* early check. */ if (page_mapped(page)) return; - if (page->mapping && !PageAnon(page)) - return; + VM_BUG_ON(page->mapping && !PageAnon(page)); __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); } @@ -3176,6 +3026,23 @@ void mem_cgroup_uncharge_end(void) batch->memcg = NULL; } +/* + * A function for resetting pc->mem_cgroup for newly allocated pages. + * This function should be called if the newpage will be added to LRU + * before start accounting. + */ +void mem_cgroup_reset_owner(struct page *newpage) +{ + struct page_cgroup *pc; + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(newpage); + VM_BUG_ON(PageCgroupUsed(pc)); + pc->mem_cgroup = root_mem_cgroup; +} + #ifdef CONFIG_SWAP /* * called after __delete_from_swap_cache() and drop "page" account. @@ -3293,14 +3160,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * page belongs to. */ int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) + struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type ctype; int ret = 0; - *ptr = NULL; + *memcgp = NULL; VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) @@ -3351,10 +3218,10 @@ int mem_cgroup_prepare_migration(struct page *page, if (!memcg) return 0; - *ptr = memcg; - ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); + *memcgp = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); css_put(&memcg->css);/* drop extra refcnt */ - if (ret || *ptr == NULL) { + if (ret) { if (PageAnon(page)) { lock_page_cgroup(pc); ClearPageCgroupMigration(pc); @@ -3364,6 +3231,7 @@ int mem_cgroup_prepare_migration(struct page *page, */ mem_cgroup_uncharge_page(page); } + /* we'll need to revisit this error code (we have -EINTR) */ return -ENOMEM; } /* @@ -3432,12 +3300,51 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, cgroup_release_and_wakeup_rmdir(&memcg->css); } +/* + * At replace page cache, newpage is not under any memcg but it's on + * LRU. So, this function doesn't touch res_counter but handles LRU + * in correct way. Both pages are locked so we cannot race with uncharge. + */ +void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage) +{ + struct mem_cgroup *memcg; + struct page_cgroup *pc; + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(oldpage); + /* fix accounting on old pages */ + lock_page_cgroup(pc); + memcg = pc->mem_cgroup; + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); + ClearPageCgroupUsed(pc); + unlock_page_cgroup(pc); + + if (PageSwapBacked(oldpage)) + type = MEM_CGROUP_CHARGE_TYPE_SHMEM; + + /* + * Even if newpage->mapping was NULL before starting replacement, + * the newpage may be on LRU(or pagevec for LRU) already. We lock + * LRU while we overwrite pc->mem_cgroup. + */ + __mem_cgroup_commit_charge_lrucare(newpage, memcg, type); +} + #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { struct page_cgroup *pc; pc = lookup_page_cgroup(page); + /* + * Can be NULL while feeding pages into the page allocator for + * the first time, i.e. during boot or memory hotplug; + * or when mem_cgroup_disabled(). + */ if (likely(pc) && PageCgroupUsed(pc)) return pc; return NULL; @@ -3457,23 +3364,8 @@ void mem_cgroup_print_bad_page(struct page *page) pc = lookup_page_cgroup_used(page); if (pc) { - int ret = -1; - char *path; - - printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", + printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", pc, pc->flags, pc->mem_cgroup); - - path = kmalloc(PATH_MAX, GFP_KERNEL); - if (path) { - rcu_read_lock(); - ret = cgroup_path(pc->mem_cgroup->css.cgroup, - path, PATH_MAX); - rcu_read_unlock(); - } - - printk(KERN_CONT "(%s)\n", - (ret < 0) ? "cannot get the path" : path); - kfree(path); } } #endif @@ -3534,9 +3426,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK, - NULL); + mem_cgroup_reclaim(memcg, GFP_KERNEL, + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3594,10 +3485,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK, - NULL); + mem_cgroup_reclaim(memcg, GFP_KERNEL, + MEM_CGROUP_RECLAIM_NOSWAP | + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3640,10 +3530,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; nr_scanned = 0; - reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, - gfp_mask, - MEM_CGROUP_RECLAIM_SOFT, - &nr_scanned); + reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, + gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; spin_lock(&mctz->lock); @@ -3711,22 +3599,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { - struct zone *zone; struct mem_cgroup_per_zone *mz; - struct page_cgroup *pc, *busy; unsigned long flags, loop; struct list_head *list; + struct page *busy; + struct zone *zone; int ret = 0; zone = &NODE_DATA(node)->node_zones[zid]; mz = mem_cgroup_zoneinfo(memcg, node, zid); - list = &mz->lists[lru]; + list = &mz->lruvec.lists[lru]; loop = MEM_CGROUP_ZSTAT(mz, lru); /* give some margin against EBUSY etc...*/ loop += 256; busy = NULL; while (loop--) { + struct page_cgroup *pc; struct page *page; ret = 0; @@ -3735,24 +3624,24 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, spin_unlock_irqrestore(&zone->lru_lock, flags); break; } - pc = list_entry(list->prev, struct page_cgroup, lru); - if (busy == pc) { - list_move(&pc->lru, list); + page = list_entry(list->prev, struct page, lru); + if (busy == page) { + list_move(&page->lru, list); busy = NULL; spin_unlock_irqrestore(&zone->lru_lock, flags); continue; } spin_unlock_irqrestore(&zone->lru_lock, flags); - page = lookup_cgroup_page(pc); + pc = lookup_page_cgroup(page); ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); - if (ret == -ENOMEM) + if (ret == -ENOMEM || ret == -EINTR) break; if (ret == -EBUSY || ret == -EINVAL) { /* found lock contention or "pc" is obsolete. */ - busy = pc; + busy = page; cond_resched(); } else busy = NULL; @@ -4846,7 +4735,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) - INIT_LIST_HEAD(&mz->lists[l]); + INIT_LIST_HEAD(&mz->lruvec.lists[l]); mz->usage_in_excess = 0; mz->on_tree = false; mz->mem = memcg; @@ -4906,7 +4795,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) mem_cgroup_remove_from_trees(memcg); free_css_id(&mem_cgroup_subsys, &memcg->css); - for_each_node_state(node, N_POSSIBLE) + for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); @@ -4965,13 +4854,13 @@ static int mem_cgroup_soft_limit_tree_init(void) struct mem_cgroup_tree_per_zone *rtpz; int tmp, node, zone; - for_each_node_state(node, N_POSSIBLE) { + for_each_node(node) { tmp = node; if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); if (!rtpn) - return 1; + goto err_cleanup; soft_limit_tree.rb_tree_per_node[node] = rtpn; @@ -4982,6 +4871,16 @@ static int mem_cgroup_soft_limit_tree_init(void) } } return 0; + +err_cleanup: + for_each_node(node) { + if (!soft_limit_tree.rb_tree_per_node[node]) + break; + kfree(soft_limit_tree.rb_tree_per_node[node]); + soft_limit_tree.rb_tree_per_node[node] = NULL; + } + return 1; + } static struct cgroup_subsys_state * __ref @@ -4995,7 +4894,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (!memcg) return ERR_PTR(error); - for_each_node_state(node, N_POSSIBLE) + for_each_node(node) if (alloc_mem_cgroup_per_zone_info(memcg, node)) goto free_out; @@ -5033,7 +4932,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); } - memcg->last_scanned_child = 0; memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); @@ -5129,9 +5027,9 @@ one_by_one: } ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &memcg, false); - if (ret || !memcg) + if (ret) /* mem_cgroup_clear_mc() will do uncharge later */ - return -ENOMEM; + return ret; mc.precharge++; } return ret; @@ -5276,7 +5174,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, } /* There is a swap entry and a page doesn't exist or isn't charged */ if (ent.val && !ret && - css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { + css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) target->ent = ent; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 06d3479..56080ea 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1557,7 +1557,7 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - 0, true); + 0, MIGRATE_SYNC); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory.c b/mm/memory.c index 829d437..5e30583 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -293,7 +293,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { struct mmu_gather_batch *batch; - tlb->need_flush = 1; + VM_BUG_ON(!tlb->need_flush); if (tlb_fast_mode(tlb)) { free_page_and_swap_cache(page); @@ -1231,7 +1231,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (next-addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); - } else if (zap_huge_pmd(tlb, vma, pmd)) + } else if (zap_huge_pmd(tlb, vma, pmd, addr)) continue; /* fall through */ } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2168489..6629faf 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -809,7 +809,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } /* this function returns # of failed pages */ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, - true, true); + true, MIGRATE_SYNC); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e3d58f0..06b145fb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -942,7 +942,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, true); + false, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 89ea085..9871a56 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -216,6 +216,56 @@ out: pte_unmap_unlock(ptep, ptl); } +#ifdef CONFIG_BLOCK +/* Returns true if all buffers are successfully locked */ +static bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) +{ + struct buffer_head *bh = head; + + /* Simple case, sync compaction */ + if (mode != MIGRATE_ASYNC) { + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return true; + } + + /* async case, we cannot block on lock_buffer so use trylock_buffer */ + do { + get_bh(bh); + if (!trylock_buffer(bh)) { + /* + * We failed to lock the buffer and cannot stall in + * async migration. Release the taken locks + */ + struct buffer_head *failed_bh = bh; + put_bh(failed_bh); + bh = head; + while (bh != failed_bh) { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + } + return false; + } + + bh = bh->b_this_page; + } while (bh != head); + return true; +} +#else +static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) +{ + return true; +} +#endif /* CONFIG_BLOCK */ + /* * Replace the page in the mapping. * @@ -225,7 +275,8 @@ out: * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ static int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + struct buffer_head *head, enum migrate_mode mode) { int expected_count; void **pslot; @@ -255,6 +306,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, } /* + * In the async migration case of moving a page with buffers, lock the + * buffers using trylock before the mapping is moved. If the mapping + * was moved, we later failed to lock the buffers and could not move + * the mapping back due to an elevated page count, we would have to + * block waiting on other references to be dropped. + */ + if (mode == MIGRATE_ASYNC && head && + !buffer_migrate_lock_buffers(head, mode)) { + page_unfreeze_refs(page, expected_count); + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + /* * Now we know that no one else is looking at the page. */ get_page(newpage); /* add cache reference */ @@ -409,13 +474,14 @@ EXPORT_SYMBOL(fail_migrate_page); * Pages are locked upon entry and exit. */ int migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); if (rc) return rc; @@ -432,28 +498,28 @@ EXPORT_SYMBOL(migrate_page); * exist. */ int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, enum migrate_mode mode) { struct buffer_head *bh, *head; int rc; if (!page_has_buffers(page)) - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); head = page_buffers(page); - rc = migrate_page_move_mapping(mapping, newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); if (rc) return rc; - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); + /* + * In the async case, migrate_page_move_mapping locked the buffers + * with an IRQ-safe spinlock held. In the sync case, the buffers + * need to be locked now + */ + if (mode != MIGRATE_ASYNC) + BUG_ON(!buffer_migrate_lock_buffers(head, mode)); ClearPagePrivate(page); set_page_private(newpage, page_private(page)); @@ -530,10 +596,14 @@ static int writeout(struct address_space *mapping, struct page *page) * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, enum migrate_mode mode) { - if (PageDirty(page)) + if (PageDirty(page)) { + /* Only writeback pages in full synchronous migration */ + if (mode != MIGRATE_SYNC) + return -EBUSY; return writeout(mapping, page); + } /* * Buffers may be managed in a filesystem specific way. @@ -543,7 +613,7 @@ static int fallback_migrate_page(struct address_space *mapping, !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } /* @@ -558,7 +628,7 @@ static int fallback_migrate_page(struct address_space *mapping, * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int remap_swapcache, bool sync) + int remap_swapcache, enum migrate_mode mode) { struct address_space *mapping; int rc; @@ -579,29 +649,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, mapping = page_mapping(page); if (!mapping) - rc = migrate_page(mapping, newpage, page); - else { + rc = migrate_page(mapping, newpage, page, mode); + else if (mapping->a_ops->migratepage) /* - * Do not writeback pages if !sync and migratepage is - * not pointing to migrate_page() which is nonblocking - * (swapcache/tmpfs uses migratepage = migrate_page). + * Most pages have a mapping and most filesystems provide a + * migratepage callback. Anonymous pages are part of swap + * space which also has its own migratepage callback. This + * is the most common path for page migration. */ - if (PageDirty(page) && !sync && - mapping->a_ops->migratepage != migrate_page) - rc = -EBUSY; - else if (mapping->a_ops->migratepage) - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(mapping, - newpage, page); - else - rc = fallback_migrate_page(mapping, newpage, page); - } + rc = mapping->a_ops->migratepage(mapping, + newpage, page, mode); + else + rc = fallback_migrate_page(mapping, newpage, page, mode); if (rc) { newpage->mapping = NULL; @@ -616,7 +675,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, } static int __unmap_and_move(struct page *page, struct page *newpage, - int force, bool offlining, bool sync) + int force, bool offlining, enum migrate_mode mode) { int rc = -EAGAIN; int remap_swapcache = 1; @@ -625,7 +684,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, struct anon_vma *anon_vma = NULL; if (!trylock_page(page)) { - if (!force || !sync) + if (!force || mode == MIGRATE_ASYNC) goto out; /* @@ -671,10 +730,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (PageWriteback(page)) { /* - * For !sync, there is no point retrying as the retry loop - * is expected to be too short for PageWriteback to be cleared + * Only in the case of a full syncronous migration is it + * necessary to wait for PageWriteback. In the async case, + * the retry loop is too short and in the sync-light case, + * the overhead of stalling is too much */ - if (!sync) { + if (mode != MIGRATE_SYNC) { rc = -EBUSY; goto uncharge; } @@ -745,7 +806,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, remap_swapcache, sync); + rc = move_to_new_page(newpage, page, remap_swapcache, mode); if (rc && remap_swapcache) remove_migration_ptes(page, page); @@ -768,7 +829,8 @@ out: * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, bool offlining, bool sync) + struct page *page, int force, bool offlining, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -777,6 +839,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!newpage) return -ENOMEM; + mem_cgroup_reset_owner(newpage); + if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ goto out; @@ -786,7 +850,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (unlikely(split_huge_page(page))) goto out; - rc = __unmap_and_move(page, newpage, force, offlining, sync); + rc = __unmap_and_move(page, newpage, force, offlining, mode); out: if (rc != -EAGAIN) { /* @@ -834,7 +898,8 @@ out: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, bool offlining, bool sync) + int force, bool offlining, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -847,7 +912,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force || !sync) + if (!force || mode != MIGRATE_SYNC) goto out; lock_page(hpage); } @@ -858,7 +923,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, 1, sync); + rc = move_to_new_page(new_hpage, hpage, 1, mode); if (rc) remove_migration_ptes(hpage, hpage); @@ -901,7 +966,7 @@ out: */ int migrate_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - bool sync) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; @@ -922,7 +987,7 @@ int migrate_pages(struct list_head *from, rc = unmap_and_move(get_new_page, private, page, pass > 2, offlining, - sync); + mode); switch(rc) { case -ENOMEM: @@ -952,7 +1017,7 @@ out: int migrate_huge_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - bool sync) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; @@ -969,7 +1034,7 @@ int migrate_huge_pages(struct list_head *from, rc = unmap_and_move_huge_page(get_new_page, private, page, pass > 2, offlining, - sync); + mode); switch(rc) { case -ENOMEM: @@ -1098,7 +1163,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, true); + (unsigned long)pm, 0, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7c122faa..2958fd8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -152,7 +152,7 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, - const struct mem_cgroup *mem, const nodemask_t *nodemask) + const struct mem_cgroup *memcg, const nodemask_t *nodemask) { if (is_global_init(p)) return true; @@ -160,7 +160,7 @@ static bool oom_unkillable_task(struct task_struct *p, return true; /* When mem_cgroup_out_of_memory() and p is not member of the group */ - if (mem && !task_in_mem_cgroup(p, mem)) + if (memcg && !task_in_mem_cgroup(p, memcg)) return true; /* p may not have freeable memory in nodemask */ @@ -179,12 +179,12 @@ static bool oom_unkillable_task(struct task_struct *p, * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ -unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, +unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { long points; - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) return 0; p = find_lock_task_mm(p); @@ -308,7 +308,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * (not docbooked, we don't want this one cluttering up the manual) */ static struct task_struct *select_bad_process(unsigned int *ppoints, - unsigned long totalpages, struct mem_cgroup *mem, + unsigned long totalpages, struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *g, *p; @@ -320,7 +320,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, if (p->exit_state) continue; - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) continue; /* @@ -364,7 +364,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, } } - points = oom_badness(p, mem, nodemask, totalpages); + points = oom_badness(p, memcg, nodemask, totalpages); if (points > *ppoints) { chosen = p; *ppoints = points; @@ -387,14 +387,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * * Call with tasklist_lock read-locked. */ -static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) +static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); for_each_process(p) { - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) continue; task = find_lock_task_mm(p); @@ -417,7 +417,7 @@ static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) } static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, - struct mem_cgroup *mem, const nodemask_t *nodemask) + struct mem_cgroup *memcg, const nodemask_t *nodemask) { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " @@ -427,14 +427,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); - mem_cgroup_print_oom_info(mem, p); + mem_cgroup_print_oom_info(memcg, p); show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) - dump_tasks(mem, nodemask); + dump_tasks(memcg, nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) -static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) +static int oom_kill_task(struct task_struct *p) { struct task_struct *q; struct mm_struct *mm; @@ -484,7 +484,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, - struct mem_cgroup *mem, nodemask_t *nodemask, + struct mem_cgroup *memcg, nodemask_t *nodemask, const char *message) { struct task_struct *victim = p; @@ -493,7 +493,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int victim_points = 0; if (printk_ratelimit()) - dump_header(p, gfp_mask, order, mem, nodemask); + dump_header(p, gfp_mask, order, memcg, nodemask); /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -524,7 +524,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * oom_badness() returns 0 if the thread is unkillable */ - child_points = oom_badness(child, mem, nodemask, + child_points = oom_badness(child, memcg, nodemask, totalpages); if (child_points > victim_points) { victim = child; @@ -533,7 +533,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } } while_each_thread(p, t); - return oom_kill_task(victim, mem); + return oom_kill_task(victim); } /* @@ -561,7 +561,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, } #ifdef CONFIG_CGROUP_MEM_RES_CTLR -void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) +void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) { unsigned long limit; unsigned int points = 0; @@ -578,14 +578,14 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); - limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; + limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, limit, mem, NULL); + p = select_bad_process(&points, limit, memcg, NULL); if (!p || PTR_ERR(p) == -1UL) goto out; - if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, + if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL, "Memory cgroup out of memory")) goto retry; out: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 794e671..0027d8f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1981,14 +1981,20 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { struct page *page; - if (!order || compaction_deferred(preferred_zone)) + if (!order) return NULL; + if (compaction_deferred(preferred_zone)) { + *deferred_compaction = true; + return NULL; + } + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration); @@ -2016,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * but not enough to satisfy watermarks. */ count_vm_event(COMPACTFAIL); - defer_compaction(preferred_zone); + + /* + * As async compaction considers a subset of pageblocks, only + * defer if the failure was a sync compaction failure. + */ + if (sync_migration) + defer_compaction(preferred_zone); cond_resched(); } @@ -2028,8 +2040,9 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { return NULL; } @@ -2179,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; bool sync_migration = false; + bool deferred_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2259,12 +2273,22 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; sync_migration = true; + /* + * If compaction is deferred for high-order allocations, it is because + * sync compaction recently failed. In this is the case and the caller + * has requested the system not be heavily disrupted, fail the + * allocation now instead of entering direct reclaim + */ + if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -2328,8 +2352,9 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; } @@ -4237,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; - enum lru_list l; + enum lru_list lru; size = zone_spanned_pages_in_node(nid, j, zones_size); realsize = size - zone_absent_pages_in_node(nid, j, @@ -4287,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - for_each_lru(l) - INIT_LIST_HEAD(&zone->lru[l].list); + for_each_lru(lru) + INIT_LIST_HEAD(&zone->lruvec.lists[lru]); zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; zone->reclaim_stat.recent_scanned[0] = 0; @@ -4642,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat) for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - if (zone->present_pages) + if (zone->present_pages) { node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); + break; + } } #endif } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 2d123f9..de1616a 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -11,13 +11,6 @@ #include <linux/swapops.h> #include <linux/kmemleak.h> -static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) -{ - pc->flags = 0; - set_page_cgroup_array_id(pc, id); - pc->mem_cgroup = NULL; - INIT_LIST_HEAD(&pc->lru); -} static unsigned long total_usage; #if !defined(CONFIG_SPARSEMEM) @@ -35,35 +28,27 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) struct page_cgroup *base; base = NODE_DATA(page_to_nid(page))->node_page_cgroup; +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_cgroup arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. + */ if (unlikely(!base)) return NULL; - +#endif offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; return base + offset; } -struct page *lookup_cgroup_page(struct page_cgroup *pc) -{ - unsigned long pfn; - struct page *page; - pg_data_t *pgdat; - - pgdat = NODE_DATA(page_cgroup_array_id(pc)); - pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; - page = pfn_to_page(pfn); - VM_BUG_ON(pc != lookup_page_cgroup(page)); - return page; -} - static int __init alloc_node_page_cgroup(int nid) { - struct page_cgroup *base, *pc; + struct page_cgroup *base; unsigned long table_size; - unsigned long start_pfn, nr_pages, index; + unsigned long nr_pages; - start_pfn = NODE_DATA(nid)->node_start_pfn; nr_pages = NODE_DATA(nid)->node_spanned_pages; - if (!nr_pages) return 0; @@ -73,10 +58,6 @@ static int __init alloc_node_page_cgroup(int nid) table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!base) return -ENOMEM; - for (index = 0; index < nr_pages; index++) { - pc = base + index; - init_page_cgroup(pc, nid); - } NODE_DATA(nid)->node_page_cgroup = base; total_usage += table_size; return 0; @@ -111,29 +92,23 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); - +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_cgroup arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. + */ if (!section->page_cgroup) return NULL; +#endif return section->page_cgroup + pfn; } -struct page *lookup_cgroup_page(struct page_cgroup *pc) -{ - struct mem_section *section; - struct page *page; - unsigned long nr; - - nr = page_cgroup_array_id(pc); - section = __nr_to_section(nr); - page = pfn_to_page(pc - section->page_cgroup); - VM_BUG_ON(pc != lookup_page_cgroup(page)); - return page; -} - static void *__meminit alloc_page_cgroup(size_t size, int nid) { + gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; void *addr = NULL; - gfp_t flags = GFP_KERNEL | __GFP_NOWARN; addr = alloc_pages_exact_nid(nid, size, flags); if (addr) { @@ -142,39 +117,20 @@ static void *__meminit alloc_page_cgroup(size_t size, int nid) } if (node_state(nid, N_HIGH_MEMORY)) - addr = vmalloc_node(size, nid); + addr = vzalloc_node(size, nid); else - addr = vmalloc(size); + addr = vzalloc(size); return addr; } -#ifdef CONFIG_MEMORY_HOTPLUG -static void free_page_cgroup(void *addr) -{ - if (is_vmalloc_addr(addr)) { - vfree(addr); - } else { - struct page *page = virt_to_page(addr); - size_t table_size = - sizeof(struct page_cgroup) * PAGES_PER_SECTION; - - BUG_ON(PageReserved(page)); - free_pages_exact(addr, table_size); - } -} -#endif - static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) { - struct page_cgroup *base, *pc; struct mem_section *section; + struct page_cgroup *base; unsigned long table_size; - unsigned long nr; - int index; - nr = pfn_to_section_nr(pfn); - section = __nr_to_section(nr); + section = __pfn_to_section(pfn); if (section->page_cgroup) return 0; @@ -194,10 +150,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) return -ENOMEM; } - for (index = 0; index < PAGES_PER_SECTION; index++) { - pc = base + index; - init_page_cgroup(pc, nr); - } /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. @@ -208,6 +160,20 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) return 0; } #ifdef CONFIG_MEMORY_HOTPLUG +static void free_page_cgroup(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); + } else { + struct page *page = virt_to_page(addr); + size_t table_size = + sizeof(struct page_cgroup) * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + free_pages_exact(addr, table_size); + } +} + void __free_page_cgroup(unsigned long pfn) { struct mem_section *ms; @@ -366,7 +332,6 @@ struct swap_cgroup { unsigned short id; }; #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) -#define SC_POS_MASK (SC_PER_PAGE - 1) /* * SwapCgroup implements "lookup" and "exchange" operations. @@ -408,6 +373,21 @@ not_enough_page: return -ENOMEM; } +static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, + struct swap_cgroup_ctrl **ctrlp) +{ + pgoff_t offset = swp_offset(ent); + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + if (ctrlp) + *ctrlp = ctrl; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + return page_address(mappage) + offset % SC_PER_PAGE; +} + /** * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. * @end: swap entry to be cmpxchged @@ -420,21 +400,13 @@ not_enough_page: unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; struct swap_cgroup_ctrl *ctrl; - struct page *mappage; struct swap_cgroup *sc; unsigned long flags; unsigned short retval; - ctrl = &swap_cgroup_ctrl[type]; + sc = lookup_swap_cgroup(ent, &ctrl); - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; spin_lock_irqsave(&ctrl->lock, flags); retval = sc->id; if (retval == old) @@ -455,21 +427,13 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, */ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; struct swap_cgroup_ctrl *ctrl; - struct page *mappage; struct swap_cgroup *sc; unsigned short old; unsigned long flags; - ctrl = &swap_cgroup_ctrl[type]; + sc = lookup_swap_cgroup(ent, &ctrl); - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; sc->id = id; @@ -479,28 +443,14 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) } /** - * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry + * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ -unsigned short lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; - struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; - unsigned short ret; - - ctrl = &swap_cgroup_ctrl[type]; - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; - ret = sc->id; - return ret; + return lookup_swap_cgroup(ent, NULL)->id; } int swap_cgroup_swapon(int type, unsigned long max_pages) @@ -773,7 +773,7 @@ out: } static int page_referenced_anon(struct page *page, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { unsigned int mapcount; @@ -796,7 +796,7 @@ static int page_referenced_anon(struct page *page, * counting on behalf of references from different * cgroups */ - if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) continue; referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); @@ -811,7 +811,7 @@ static int page_referenced_anon(struct page *page, /** * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. - * @mem_cont: target memory controller + * @memcg: target memory control group * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * For an object-based mapped page, find all the places it is mapped and @@ -822,7 +822,7 @@ static int page_referenced_anon(struct page *page, * This function is only called from page_referenced for object-based pages. */ static int page_referenced_file(struct page *page, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { unsigned int mapcount; @@ -864,7 +864,7 @@ static int page_referenced_file(struct page *page, * counting on behalf of references from different * cgroups */ - if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) continue; referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); @@ -880,7 +880,7 @@ static int page_referenced_file(struct page *page, * page_referenced - test if the page was referenced * @page: the page to test * @is_locked: caller holds lock on the page - * @mem_cont: target memory controller + * @memcg: target memory cgroup * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * Quick test_and_clear_referenced for all mappings to a page, @@ -888,7 +888,7 @@ static int page_referenced_file(struct page *page, */ int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { int referenced = 0; @@ -904,13 +904,13 @@ int page_referenced(struct page *page, } } if (unlikely(PageKsm(page))) - referenced += page_referenced_ksm(page, mem_cont, + referenced += page_referenced_ksm(page, memcg, vm_flags); else if (PageAnon(page)) - referenced += page_referenced_anon(page, mem_cont, + referenced += page_referenced_anon(page, memcg, vm_flags); else if (page->mapping) - referenced += page_referenced_file(page, mem_cont, + referenced += page_referenced_file(page, memcg, vm_flags); if (we_locked) unlock_page(page); @@ -366,7 +366,8 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page const char *n) { VM_BUG_ON(!irqs_disabled()); -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -400,7 +401,8 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_new, unsigned long counters_new, const char *n) { -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -3014,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s, } } -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; @@ -23,7 +23,6 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> -#include <linux/buffer_head.h> /* for try_to_release_page() */ #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/cpu.h> @@ -54,7 +53,7 @@ static void __page_cache_release(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); - del_page_from_lru(zone, page); + del_page_from_lru_list(zone, page, page_off_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } } @@ -232,12 +231,14 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, static void pagevec_move_tail_fn(struct page *page, void *arg) { int *pgmoved = arg; - struct zone *zone = page_zone(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - mem_cgroup_rotate_reclaimable_page(page); + struct lruvec *lruvec; + + lruvec = mem_cgroup_lru_move_lists(page_zone(page), + page, lru, lru); + list_move_tail(&page->lru, &lruvec->lists[lru]); (*pgmoved)++; } } @@ -368,7 +369,6 @@ void mark_page_accessed(struct page *page) SetPageReferenced(page); } } - EXPORT_SYMBOL(mark_page_accessed); void __lru_cache_add(struct page *page, enum lru_list lru) @@ -377,7 +377,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru) page_cache_get(page); if (!pagevec_add(pvec, page)) - ____pagevec_lru_add(pvec, lru); + __pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } EXPORT_SYMBOL(__lru_cache_add); @@ -476,12 +476,13 @@ static void lru_deactivate_fn(struct page *page, void *arg) */ SetPageReclaim(page); } else { + struct lruvec *lruvec; /* * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. */ - list_move_tail(&page->lru, &zone->lru[lru].list); - mem_cgroup_rotate_reclaimable_page(page); + lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru); + list_move_tail(&page->lru, &lruvec->lists[lru]); __count_vm_event(PGROTATED); } @@ -504,7 +505,7 @@ static void drain_cpu_pagevecs(int cpu) for_each_lru(lru) { pvec = &pvecs[lru - LRU_BASE]; if (pagevec_count(pvec)) - ____pagevec_lru_add(pvec, lru); + __pagevec_lru_add(pvec, lru); } pvec = &per_cpu(lru_rotate_pvecs, cpu); @@ -616,7 +617,7 @@ void release_pages(struct page **pages, int nr, int cold) } VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); - del_page_from_lru(zone, page); + del_page_from_lru_list(zone, page, page_off_lru(page)); } list_add(&page->lru, &pages_to_free); @@ -644,9 +645,9 @@ void __pagevec_release(struct pagevec *pvec) release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); pagevec_reinit(pvec); } - EXPORT_SYMBOL(__pagevec_release); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* used by __split_huge_page_refcount() */ void lru_add_page_tail(struct zone* zone, struct page *page, struct page *page_tail) @@ -654,7 +655,6 @@ void lru_add_page_tail(struct zone* zone, int active; enum lru_list lru; const int file = 0; - struct list_head *head; VM_BUG_ON(!PageHead(page)); VM_BUG_ON(PageCompound(page_tail)); @@ -673,18 +673,30 @@ void lru_add_page_tail(struct zone* zone, lru = LRU_INACTIVE_ANON; } update_page_reclaim_stat(zone, page_tail, file, active); - if (likely(PageLRU(page))) - head = page->lru.prev; - else - head = &zone->lru[lru].list; - __add_page_to_lru_list(zone, page_tail, lru, head); } else { SetPageUnevictable(page_tail); - add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + lru = LRU_UNEVICTABLE; + } + + if (likely(PageLRU(page))) + list_add_tail(&page_tail->lru, &page->lru); + else { + struct list_head *list_head; + /* + * Head page has not yet been counted, as an hpage, + * so we must account for each subpage individually. + * + * Use the standard add function to put page_tail on the list, + * but then correct its position so they all end up in order. + */ + add_page_to_lru_list(zone, page_tail, lru); + list_head = page_tail->lru.prev; + list_move_tail(&page_tail->lru, list_head); } } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void ____pagevec_lru_add_fn(struct page *page, void *arg) +static void __pagevec_lru_add_fn(struct page *page, void *arg) { enum lru_list lru = (enum lru_list)arg; struct zone *zone = page_zone(page); @@ -706,32 +718,13 @@ static void ____pagevec_lru_add_fn(struct page *page, void *arg) * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ -void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { VM_BUG_ON(is_unevictable_lru(lru)); - pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); -} - -EXPORT_SYMBOL(____pagevec_lru_add); - -/* - * Try to drop buffers from the pages in a pagevec - */ -void pagevec_strip(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - - if (page_has_private(page) && trylock_page(page)) { - if (page_has_private(page)) - try_to_release_page(page, 0); - unlock_page(page); - } - } + pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); } +EXPORT_SYMBOL(__pagevec_lru_add); /** * pagevec_lookup - gang pagecache lookup @@ -755,7 +748,6 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); return pagevec_count(pvec); } - EXPORT_SYMBOL(pagevec_lookup); unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, @@ -765,7 +757,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, nr_pages, pvec->pages); return pagevec_count(pvec); } - EXPORT_SYMBOL(pagevec_lookup_tag); /* diff --git a/mm/swap_state.c b/mm/swap_state.c index ea6b32d..470038a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -300,6 +300,16 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ + /* + * The memcg-specific accounting when moving + * pages around the LRU lists relies on the + * page's owner (memcg) to be valid. Usually, + * pages are assigned to a new owner before + * being put on the LRU list, but since this + * is not the case here, the stale owner from + * a previous allocation cycle must be reset. + */ + mem_cgroup_reset_owner(new_page); } /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 9520592..d999f09 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -847,12 +847,13 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { - struct mem_cgroup *ptr; + struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, + GFP_KERNEL, &memcg)) { ret = -ENOMEM; goto out_nolock; } @@ -860,7 +861,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { if (ret > 0) - mem_cgroup_cancel_charge_swapin(ptr); + mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; } @@ -871,7 +872,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); page_add_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge_swapin(page, ptr); + mem_cgroup_commit_charge_swapin(page, memcg); swap_free(entry); /* * Move the page to the active list so it is not diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 877ca04..86ce9a5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2378,7 +2378,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) - goto err_free; + goto err_free2; for (area = 0; area < nr_vms; area++) { vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); @@ -2476,11 +2476,10 @@ found: err_free: for (area = 0; area < nr_vms; area++) { - if (vas) - kfree(vas[area]); - if (vms) - kfree(vms[area]); + kfree(vas[area]); + kfree(vms[area]); } +err_free2: kfree(vas); kfree(vms); return NULL; diff --git a/mm/vmscan.c b/mm/vmscan.c index 26f4a8a..2880396 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -103,8 +103,11 @@ struct scan_control { */ reclaim_mode_t reclaim_mode; - /* Which cgroup do we reclaim from */ - struct mem_cgroup *mem_cgroup; + /* + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation. + */ + struct mem_cgroup *target_mem_cgroup; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -113,6 +116,11 @@ struct scan_control { nodemask_t *nodemask; }; +struct mem_cgroup_zone { + struct mem_cgroup *mem_cgroup; + struct zone *zone; +}; + #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) #ifdef ARCH_HAS_PREFETCH @@ -153,28 +161,45 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_CGROUP_MEM_RES_CTLR -#define scanning_global_lru(sc) (!(sc)->mem_cgroup) +static bool global_reclaim(struct scan_control *sc) +{ + return !sc->target_mem_cgroup; +} + +static bool scanning_global_lru(struct mem_cgroup_zone *mz) +{ + return !mz->mem_cgroup; +} #else -#define scanning_global_lru(sc) (1) +static bool global_reclaim(struct scan_control *sc) +{ + return true; +} + +static bool scanning_global_lru(struct mem_cgroup_zone *mz) +{ + return true; +} #endif -static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, - struct scan_control *sc) +static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) { - if (!scanning_global_lru(sc)) - return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); + if (!scanning_global_lru(mz)) + return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); - return &zone->reclaim_stat; + return &mz->zone->reclaim_stat; } -static unsigned long zone_nr_lru_pages(struct zone *zone, - struct scan_control *sc, enum lru_list lru) +static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, + enum lru_list lru) { - if (!scanning_global_lru(sc)) - return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, - zone_to_nid(zone), zone_idx(zone), BIT(lru)); + if (!scanning_global_lru(mz)) + return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, + zone_to_nid(mz->zone), + zone_idx(mz->zone), + BIT(lru)); - return zone_page_state(zone, NR_LRU_BASE + lru); + return zone_page_state(mz->zone, NR_LRU_BASE + lru); } @@ -677,12 +702,13 @@ enum page_references { }; static enum page_references page_check_references(struct page *page, + struct mem_cgroup_zone *mz, struct scan_control *sc) { int referenced_ptes, referenced_page; unsigned long vm_flags; - referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); + referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ @@ -738,7 +764,7 @@ static enum page_references page_check_references(struct page *page, * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct zone *zone, + struct mem_cgroup_zone *mz, struct scan_control *sc, int priority, unsigned long *ret_nr_dirty, @@ -769,7 +795,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); - VM_BUG_ON(page_zone(page) != zone); + VM_BUG_ON(page_zone(page) != mz->zone); sc->nr_scanned++; @@ -803,7 +829,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - references = page_check_references(page, sc); + references = page_check_references(page, mz, sc); switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; @@ -994,8 +1020,8 @@ keep_lumpy: * back off and wait for congestion to clear because further reclaim * will encounter the same problem */ - if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) - zone_set_flag(zone, ZONE_CONGESTED); + if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) + zone_set_flag(mz->zone, ZONE_CONGESTED); free_hot_cold_page_list(&free_pages, 1); @@ -1049,8 +1075,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) ret = -EBUSY; - if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) - return ret; + /* + * To minimise LRU disruption, the caller can indicate that it only + * wants to isolate pages it will be able to operate on without + * blocking - clean pages for the most part. + * + * ISOLATE_CLEAN means that only clean pages should be isolated. This + * is used by reclaim when it is cannot write to backing storage + * + * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages + * that it is possible to migrate without blocking + */ + if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + /* All the caller can do on PageWriteback is block */ + if (PageWriteback(page)) + return ret; + + if (PageDirty(page)) { + struct address_space *mapping; + + /* ISOLATE_CLEAN means only clean pages */ + if (mode & ISOLATE_CLEAN) + return ret; + + /* + * Only pages without mappings or that have a + * ->migratepage callback are possible to migrate + * without blocking + */ + mapping = page_mapping(page); + if (mapping && !mapping->a_ops->migratepage) + return ret; + } + } if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) return ret; @@ -1079,25 +1136,36 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) * Appropriate locks must be held before calling this function. * * @nr_to_scan: The number of pages to look through on the list. - * @src: The LRU list to pull pages off. + * @mz: The mem_cgroup_zone to pull pages from. * @dst: The temp list to put pages on to. - * @scanned: The number of pages that were scanned. + * @nr_scanned: The number of pages that were scanned. * @order: The caller's attempted allocation order * @mode: One of the LRU isolation modes + * @active: True [1] if isolating active pages * @file: True [1] if isolating file [!anon] pages * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, - struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, isolate_mode_t mode, - int file) + struct mem_cgroup_zone *mz, struct list_head *dst, + unsigned long *nr_scanned, int order, isolate_mode_t mode, + int active, int file) { + struct lruvec *lruvec; + struct list_head *src; unsigned long nr_taken = 0; unsigned long nr_lumpy_taken = 0; unsigned long nr_lumpy_dirty = 0; unsigned long nr_lumpy_failed = 0; unsigned long scan; + int lru = LRU_BASE; + + lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup); + if (active) + lru += LRU_ACTIVE; + if (file) + lru += LRU_FILE; + src = &lruvec->lists[lru]; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { struct page *page; @@ -1113,15 +1181,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode, file)) { case 0: + mem_cgroup_lru_del(page); list_move(&page->lru, dst); - mem_cgroup_del_lru(page); nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* else it is being freed elsewhere */ list_move(&page->lru, src); - mem_cgroup_rotate_lru_list(page, page_lru(page)); continue; default: @@ -1171,13 +1238,17 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, break; if (__isolate_lru_page(cursor_page, mode, file) == 0) { + unsigned int isolated_pages; + + mem_cgroup_lru_del(cursor_page); list_move(&cursor_page->lru, dst); - mem_cgroup_del_lru(cursor_page); - nr_taken += hpage_nr_pages(cursor_page); - nr_lumpy_taken++; + isolated_pages = hpage_nr_pages(cursor_page); + nr_taken += isolated_pages; + nr_lumpy_taken += isolated_pages; if (PageDirty(cursor_page)) - nr_lumpy_dirty++; + nr_lumpy_dirty += isolated_pages; scan++; + pfn += isolated_pages - 1; } else { /* * Check if the page is freed already. @@ -1203,57 +1274,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, nr_lumpy_failed++; } - *scanned = scan; + *nr_scanned = scan; trace_mm_vmscan_lru_isolate(order, nr_to_scan, scan, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, - mode); + mode, file); return nr_taken; } -static unsigned long isolate_pages_global(unsigned long nr, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, int active, int file) -{ - int lru = LRU_BASE; - if (active) - lru += LRU_ACTIVE; - if (file) - lru += LRU_FILE; - return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, - mode, file); -} - -/* - * clear_active_flags() is a helper for shrink_active_list(), clearing - * any active bits from the pages in the list. - */ -static unsigned long clear_active_flags(struct list_head *page_list, - unsigned int *count) -{ - int nr_active = 0; - int lru; - struct page *page; - - list_for_each_entry(page, page_list, lru) { - int numpages = hpage_nr_pages(page); - lru = page_lru_base_type(page); - if (PageActive(page)) { - lru += LRU_ACTIVE; - ClearPageActive(page); - nr_active += numpages; - } - if (count) - count[lru] += numpages; - } - - return nr_active; -} - /** * isolate_lru_page - tries to isolate a page from its LRU list * @page: page to isolate from its LRU list @@ -1313,7 +1343,7 @@ static int too_many_isolated(struct zone *zone, int file, if (current_is_kswapd()) return 0; - if (!scanning_global_lru(sc)) + if (!global_reclaim(sc)) return 0; if (file) { @@ -1327,27 +1357,21 @@ static int too_many_isolated(struct zone *zone, int file, return isolated > inactive; } -/* - * TODO: Try merging with migrations version of putback_lru_pages - */ static noinline_for_stack void -putback_lru_pages(struct zone *zone, struct scan_control *sc, - unsigned long nr_anon, unsigned long nr_file, - struct list_head *page_list) +putback_inactive_pages(struct mem_cgroup_zone *mz, + struct list_head *page_list) { - struct page *page; - struct pagevec pvec; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - - pagevec_init(&pvec, 1); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + struct zone *zone = mz->zone; + LIST_HEAD(pages_to_free); /* * Put back any unfreeable pages. */ - spin_lock(&zone->lru_lock); while (!list_empty(page_list)) { + struct page *page = lru_to_page(page_list); int lru; - page = lru_to_page(page_list); + VM_BUG_ON(PageLRU(page)); list_del(&page->lru); if (unlikely(!page_evictable(page, NULL))) { @@ -1364,30 +1388,53 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; } - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(zone, page, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, &pages_to_free); } } - __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); + /* + * To save our caller's stack, now use input list for pages to free. + */ + list_splice(&pages_to_free, page_list); } -static noinline_for_stack void update_isolated_counts(struct zone *zone, - struct scan_control *sc, - unsigned long *nr_anon, - unsigned long *nr_file, - struct list_head *isolated_list) +static noinline_for_stack void +update_isolated_counts(struct mem_cgroup_zone *mz, + struct list_head *page_list, + unsigned long *nr_anon, + unsigned long *nr_file) { - unsigned long nr_active; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + struct zone *zone = mz->zone; unsigned int count[NR_LRU_LISTS] = { 0, }; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + unsigned long nr_active = 0; + struct page *page; + int lru; + + /* + * Count pages and clear active flags + */ + list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); + lru = page_lru_base_type(page); + if (PageActive(page)) { + lru += LRU_ACTIVE; + ClearPageActive(page); + nr_active += numpages; + } + count[lru] += numpages; + } - nr_active = clear_active_flags(isolated_list, count); __count_vm_events(PGDEACTIVATE, nr_active); __mod_zone_page_state(zone, NR_ACTIVE_FILE, @@ -1401,8 +1448,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); reclaim_stat->recent_scanned[0] += *nr_anon; reclaim_stat->recent_scanned[1] += *nr_file; @@ -1454,8 +1499,8 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, * of reclaimed pages */ static noinline_for_stack unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, - struct scan_control *sc, int priority, int file) +shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, + struct scan_control *sc, int priority, int file) { LIST_HEAD(page_list); unsigned long nr_scanned; @@ -1466,6 +1511,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_dirty = 0; unsigned long nr_writeback = 0; isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; + struct zone *zone = mz->zone; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1488,9 +1534,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(nr_to_scan, &page_list, - &nr_scanned, sc->order, reclaim_mode, zone, 0, file); + nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, + &nr_scanned, sc->order, + reclaim_mode, 0, file); + if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, @@ -1498,14 +1545,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, else __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); - } else { - nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, - &nr_scanned, sc->order, reclaim_mode, zone, - sc->mem_cgroup, 0, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. - */ } if (nr_taken == 0) { @@ -1513,26 +1552,37 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return 0; } - update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); + update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); + + __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, + nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, zone, sc, + nr_reclaimed += shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); } - local_irq_disable(); + spin_lock_irq(&zone->lru_lock); + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); - putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + putback_inactive_pages(mz, &page_list); + + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + + spin_unlock_irq(&zone->lru_lock); + + free_hot_cold_page_list(&page_list, 1); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1588,30 +1638,47 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, static void move_active_pages_to_lru(struct zone *zone, struct list_head *list, + struct list_head *pages_to_free, enum lru_list lru) { unsigned long pgmoved = 0; - struct pagevec pvec; struct page *page; - pagevec_init(&pvec, 1); + if (buffer_heads_over_limit) { + spin_unlock_irq(&zone->lru_lock); + list_for_each_entry(page, list, lru) { + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) + try_to_release_page(page, 0); + unlock_page(page); + } + } + spin_lock_irq(&zone->lru_lock); + } while (!list_empty(list)) { + struct lruvec *lruvec; + page = lru_to_page(list); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); + lruvec = mem_cgroup_lru_add_list(zone, page, lru); + list_move(&page->lru, &lruvec->lists[lru]); pgmoved += hpage_nr_pages(page); - if (!pagevec_add(&pvec, page) || list_empty(list)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(zone, page, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, pages_to_free); } } __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); @@ -1619,19 +1686,22 @@ static void move_active_pages_to_lru(struct zone *zone, __count_vm_events(PGDEACTIVATE, pgmoved); } -static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc, int priority, int file) +static void shrink_active_list(unsigned long nr_to_scan, + struct mem_cgroup_zone *mz, + struct scan_control *sc, + int priority, int file) { unsigned long nr_taken; - unsigned long pgscanned; + unsigned long nr_scanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); unsigned long nr_rotated = 0; isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; + struct zone *zone = mz->zone; lru_add_drain(); @@ -1641,26 +1711,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, reclaim_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(nr_pages, &l_hold, - &pgscanned, sc->order, - reclaim_mode, zone, - 1, file); - zone->pages_scanned += pgscanned; - } else { - nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, - &pgscanned, sc->order, - reclaim_mode, zone, - sc->mem_cgroup, 1, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. - */ - } + + nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, + &nr_scanned, sc->order, + reclaim_mode, 1, file); + if (global_reclaim(sc)) + zone->pages_scanned += nr_scanned; reclaim_stat->recent_scanned[file] += nr_taken; - __count_zone_vm_events(PGREFILL, zone, pgscanned); + __count_zone_vm_events(PGREFILL, zone, nr_scanned); if (file) __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); else @@ -1678,7 +1738,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, continue; } - if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { + if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and @@ -1711,12 +1771,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[file] += nr_rotated; - move_active_pages_to_lru(zone, &l_active, + move_active_pages_to_lru(zone, &l_active, &l_hold, LRU_ACTIVE + file * LRU_FILE); - move_active_pages_to_lru(zone, &l_inactive, + move_active_pages_to_lru(zone, &l_inactive, &l_hold, LRU_BASE + file * LRU_FILE); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); + + free_hot_cold_page_list(&l_hold, 1); } #ifdef CONFIG_SWAP @@ -1741,10 +1803,8 @@ static int inactive_anon_is_low_global(struct zone *zone) * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_anon_is_low(struct mem_cgroup_zone *mz) { - int low; - /* * If we don't have swap space, anonymous page deactivation * is pointless. @@ -1752,15 +1812,14 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) if (!total_swap_pages) return 0; - if (scanning_global_lru(sc)) - low = inactive_anon_is_low_global(zone); - else - low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); - return low; + if (!scanning_global_lru(mz)) + return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, + mz->zone); + + return inactive_anon_is_low_global(mz->zone); } #else -static inline int inactive_anon_is_low(struct zone *zone, - struct scan_control *sc) +static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) { return 0; } @@ -1778,8 +1837,7 @@ static int inactive_file_is_low_global(struct zone *zone) /** * inactive_file_is_low - check if file pages need to be deactivated - * @zone: zone to check - * @sc: scan control of this context + * @mz: memory cgroup and zone to check * * When the system is doing streaming IO, memory pressure here * ensures that active file pages get deactivated, until more @@ -1791,45 +1849,44 @@ static int inactive_file_is_low_global(struct zone *zone) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. */ -static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_file_is_low(struct mem_cgroup_zone *mz) { - int low; + if (!scanning_global_lru(mz)) + return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, + mz->zone); - if (scanning_global_lru(sc)) - low = inactive_file_is_low_global(zone); - else - low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone); - return low; + return inactive_file_is_low_global(mz->zone); } -static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, - int file) +static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) { if (file) - return inactive_file_is_low(zone, sc); + return inactive_file_is_low(mz); else - return inactive_anon_is_low(zone, sc); + return inactive_anon_is_low(mz); } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct zone *zone, struct scan_control *sc, int priority) + struct mem_cgroup_zone *mz, + struct scan_control *sc, int priority) { int file = is_file_lru(lru); if (is_active_lru(lru)) { - if (inactive_list_is_low(zone, sc, file)) - shrink_active_list(nr_to_scan, zone, sc, priority, file); + if (inactive_list_is_low(mz, file)) + shrink_active_list(nr_to_scan, mz, sc, priority, file); return 0; } - return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); + return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); } -static int vmscan_swappiness(struct scan_control *sc) +static int vmscan_swappiness(struct mem_cgroup_zone *mz, + struct scan_control *sc) { - if (scanning_global_lru(sc)) + if (global_reclaim(sc)) return vm_swappiness; - return mem_cgroup_swappiness(sc->mem_cgroup); + return mem_cgroup_swappiness(mz->mem_cgroup); } /* @@ -1840,15 +1897,15 @@ static int vmscan_swappiness(struct scan_control *sc) * * nr[0] = anon pages to scan; nr[1] = file pages to scan */ -static void get_scan_count(struct zone *zone, struct scan_control *sc, - unsigned long *nr, int priority) +static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, + unsigned long *nr, int priority) { unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); u64 fraction[2], denominator; - enum lru_list l; + enum lru_list lru; int noswap = 0; bool force_scan = false; @@ -1862,9 +1919,9 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. */ - if (scanning_global_lru(sc) && current_is_kswapd()) + if (current_is_kswapd() && mz->zone->all_unreclaimable) force_scan = true; - if (!scanning_global_lru(sc)) + if (!global_reclaim(sc)) force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ @@ -1876,16 +1933,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, goto out; } - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); - if (scanning_global_lru(sc)) { - free = zone_page_state(zone, NR_FREE_PAGES); + if (global_reclaim(sc)) { + free = zone_page_state(mz->zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ - if (unlikely(file + free <= high_wmark_pages(zone))) { + if (unlikely(file + free <= high_wmark_pages(mz->zone))) { fraction[0] = 1; fraction[1] = 0; denominator = 1; @@ -1897,8 +1954,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(sc); - file_prio = 200 - vmscan_swappiness(sc); + anon_prio = vmscan_swappiness(mz, sc); + file_prio = 200 - vmscan_swappiness(mz, sc); /* * OK, so we have swap space and a fair amount of page cache @@ -1911,7 +1968,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * * anon in [0], file in [1] */ - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&mz->zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; @@ -1932,24 +1989,24 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&mz->zone->lru_lock); fraction[0] = ap; fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(l) { - int file = is_file_lru(l); + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); unsigned long scan; - scan = zone_nr_lru_pages(zone, sc, l); + scan = zone_nr_lru_pages(mz, lru); if (priority || noswap) { scan >>= priority; if (!scan && force_scan) scan = SWAP_CLUSTER_MAX; scan = div64_u64(scan * fraction[file], denominator); } - nr[l] = scan; + nr[lru] = scan; } } @@ -1960,7 +2017,7 @@ out: * back to the allocator and call try_to_compact_zone(), we ensure that * there are enough free pages for it to be likely successful */ -static inline bool should_continue_reclaim(struct zone *zone, +static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, unsigned long nr_reclaimed, unsigned long nr_scanned, struct scan_control *sc) @@ -2000,15 +2057,15 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); if (nr_swap_pages > 0) - inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order)) { + switch (compaction_suitable(mz->zone, sc->order)) { case COMPACT_PARTIAL: case COMPACT_CONTINUE: return false; @@ -2020,12 +2077,12 @@ static inline bool should_continue_reclaim(struct zone *zone, /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_zone(int priority, struct zone *zone, - struct scan_control *sc) +static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, + struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; - enum lru_list l; + enum lru_list lru; unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; @@ -2033,19 +2090,19 @@ static void shrink_zone(int priority, struct zone *zone, restart: nr_reclaimed = 0; nr_scanned = sc->nr_scanned; - get_scan_count(zone, sc, nr, priority); + get_scan_count(mz, sc, nr, priority); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { - for_each_evictable_lru(l) { - if (nr[l]) { + for_each_evictable_lru(lru) { + if (nr[lru]) { nr_to_scan = min_t(unsigned long, - nr[l], SWAP_CLUSTER_MAX); - nr[l] -= nr_to_scan; + nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; - nr_reclaimed += shrink_list(l, nr_to_scan, - zone, sc, priority); + nr_reclaimed += shrink_list(lru, nr_to_scan, + mz, sc, priority); } } /* @@ -2066,17 +2123,89 @@ restart: * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + if (inactive_anon_is_low(mz)) + shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); /* reclaim/compaction might need reclaim to continue */ - if (should_continue_reclaim(zone, nr_reclaimed, + if (should_continue_reclaim(mz, nr_reclaimed, sc->nr_scanned - nr_scanned, sc)) goto restart; throttle_vm_writeout(sc->gfp_mask); } +static void shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) +{ + struct mem_cgroup *root = sc->target_mem_cgroup; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = priority, + }; + struct mem_cgroup *memcg; + + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { + struct mem_cgroup_zone mz = { + .mem_cgroup = memcg, + .zone = zone, + }; + + shrink_mem_cgroup_zone(priority, &mz, sc); + /* + * Limit reclaim has historically picked one memcg and + * scanned it with decreasing priority levels until + * nr_to_reclaim had been reclaimed. This priority + * cycle is thus over after a single memcg. + * + * Direct reclaim and kswapd, on the other hand, have + * to scan all memory cgroups to fulfill the overall + * scan target for the zone. + */ + if (!global_reclaim(sc)) { + mem_cgroup_iter_break(root, memcg); + break; + } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); +} + +/* Returns true if compaction should go ahead for a high-order request */ +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +{ + unsigned long balance_gap, watermark; + bool watermark_ok; + + /* Do not consider compaction for orders reclaim is meant to satisfy */ + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) + return false; + + /* + * Compaction takes time to run and there are potentially other + * callers using the pages just freed. Continue reclaiming until + * there is a buffer of free pages available to give compaction + * a reasonable chance of completing and allocating the page + */ + balance_gap = min(low_wmark_pages(zone), + (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + + /* + * If compaction is deferred, reclaim up to a point where + * compaction will have a chance of success when re-enabled + */ + if (compaction_deferred(zone)) + return watermark_ok; + + /* If compaction is not ready to start, keep reclaiming */ + if (!compaction_suitable(zone, sc->order)) + return false; + + return watermark_ok; +} + /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -2094,8 +2223,9 @@ restart: * scan then give up on it. * * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is either ready to begin or deferred. - * This indicates to the caller that it should retry the allocation or fail. + * high-order allocation and compaction is ready to begin. This indicates to + * the caller that it should consider retrying the allocation instead of + * further reclaim. */ static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) @@ -2104,7 +2234,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - bool should_abort_reclaim = false; + bool aborted_reclaim = false; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2114,7 +2244,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * Take care memory controller reclaiming has small influence * to global LRU. */ - if (scanning_global_lru(sc)) { + if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) @@ -2129,10 +2259,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * noticable problem, like transparent huge page * allocations. */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER && - (compaction_suitable(zone, sc->order) || - compaction_deferred(zone))) { - should_abort_reclaim = true; + if (compaction_ready(zone, sc)) { + aborted_reclaim = true; continue; } } @@ -2154,7 +2282,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, shrink_zone(priority, zone, sc); } - return should_abort_reclaim; + return aborted_reclaim; } static bool zone_reclaimable(struct zone *zone) @@ -2208,25 +2336,25 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; unsigned long writeback_threshold; + bool aborted_reclaim; get_mems_allowed(); delayacct_freepages_start(); - if (scanning_global_lru(sc)) + if (global_reclaim(sc)) count_vm_event(ALLOCSTALL); for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; if (!priority) - disable_swap_token(sc->mem_cgroup); - if (shrink_zones(priority, zonelist, sc)) - break; + disable_swap_token(sc->target_mem_cgroup); + aborted_reclaim = shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups */ - if (scanning_global_lru(sc)) { + if (global_reclaim(sc)) { unsigned long lru_pages = 0; for_each_zone_zonelist(zone, z, zonelist, gfp_zone(sc->gfp_mask)) { @@ -2287,8 +2415,12 @@ out: if (oom_killer_disabled) return 0; + /* Aborted reclaim to try compaction? don't OOM, then */ + if (aborted_reclaim) + return 1; + /* top priority shrink_zones still had more to do? don't OOM, then */ - if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) + if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) return 1; return 0; @@ -2305,7 +2437,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_unmap = 1, .may_swap = 1, .order = order, - .mem_cgroup = NULL, + .target_mem_cgroup = NULL, .nodemask = nodemask, }; struct shrink_control shrink = { @@ -2325,7 +2457,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, struct zone *zone, unsigned long *nr_scanned) @@ -2337,7 +2469,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_unmap = 1, .may_swap = !noswap, .order = 0, - .mem_cgroup = mem, + .target_mem_cgroup = memcg, + }; + struct mem_cgroup_zone mz = { + .mem_cgroup = memcg, + .zone = zone, }; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | @@ -2354,7 +2490,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_zone(0, zone, &sc); + shrink_mem_cgroup_zone(0, &mz, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2362,7 +2498,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, return sc.nr_reclaimed; } -unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap) { @@ -2375,7 +2511,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, .order = 0, - .mem_cgroup = mem_cont, + .target_mem_cgroup = memcg, .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), @@ -2389,7 +2525,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * take care of from where we get pages. So the node where we start the * scan does not need to be the current node. */ - nid = mem_cgroup_select_victim_node(mem_cont); + nid = mem_cgroup_select_victim_node(memcg); zonelist = NODE_DATA(nid)->node_zonelists; @@ -2405,6 +2541,29 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +static void age_active_anon(struct zone *zone, struct scan_control *sc, + int priority) +{ + struct mem_cgroup *memcg; + + if (!total_swap_pages) + return; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct mem_cgroup_zone mz = { + .mem_cgroup = memcg, + .zone = zone, + }; + + if (inactive_anon_is_low(&mz)) + shrink_active_list(SWAP_CLUSTER_MAX, &mz, + sc, priority, 0); + + memcg = mem_cgroup_iter(NULL, memcg, NULL); + } while (memcg); +} + /* * pgdat_balanced is used when checking if a node is balanced for high-order * allocations. Only zones that meet watermarks and are in a zone allowed @@ -2525,7 +2684,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, */ .nr_to_reclaim = ULONG_MAX, .order = order, - .mem_cgroup = NULL, + .target_mem_cgroup = NULL, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, @@ -2564,9 +2723,7 @@ loop_again: * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. */ - if (inactive_anon_is_low(zone, &sc)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, - &sc, priority, 0); + age_active_anon(zone, &sc, priority); if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { @@ -3355,16 +3512,18 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) */ static void check_move_unevictable_page(struct page *page, struct zone *zone) { - VM_BUG_ON(PageActive(page)); + struct lruvec *lruvec; + VM_BUG_ON(PageActive(page)); retry: ClearPageUnevictable(page); if (page_evictable(page, NULL)) { enum lru_list l = page_lru_base_type(page); __dec_zone_state(zone, NR_UNEVICTABLE); - list_move(&page->lru, &zone->lru[l].list); - mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); + lruvec = mem_cgroup_lru_move_lists(zone, page, + LRU_UNEVICTABLE, l); + list_move(&page->lru, &lruvec->lists[l]); __inc_zone_state(zone, NR_INACTIVE_ANON + l); __count_vm_event(UNEVICTABLE_PGRESCUED); } else { @@ -3372,8 +3531,9 @@ retry: * rotate unevictable list */ SetPageUnevictable(page); - list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); - mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); + lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE, + LRU_UNEVICTABLE); + list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]); if (page_evictable(page, NULL)) goto retry; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 8fd603b..f600557 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(__dec_zone_page_state); -#ifdef CONFIG_CMPXCHG_LOCAL +#ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * If we have cmpxchg_local support then we do not need to incur the overhead * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile new file mode 100644 index 0000000..4ec8401 --- /dev/null +++ b/tools/testing/selftests/Makefile @@ -0,0 +1,11 @@ +TARGETS = breakpoints + +all: + for TARGET in $(TARGETS); do \ + make -C $$TARGET; \ + done; + +clean: + for TARGET in $(TARGETS); do \ + make -C $$TARGET clean; \ + done; diff --git a/tools/testing/selftests/breakpoints/Makefile b/tools/testing/selftests/breakpoints/Makefile new file mode 100644 index 0000000..f362722 --- /dev/null +++ b/tools/testing/selftests/breakpoints/Makefile @@ -0,0 +1,20 @@ +# Taken from perf makefile +uname_M := $(shell uname -m 2>/dev/null || echo not) +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) +ifeq ($(ARCH),i386) + ARCH := x86 +endif +ifeq ($(ARCH),x86_64) + ARCH := x86 +endif + + +all: +ifeq ($(ARCH),x86) + gcc breakpoint_test.c -o run_test +else + echo "Not an x86 target, can't build breakpoints selftests" +endif + +clean: + rm -fr run_test diff --git a/tools/testing/selftests/breakpoints/breakpoint_test.c b/tools/testing/selftests/breakpoints/breakpoint_test.c new file mode 100644 index 0000000..a0743f3 --- /dev/null +++ b/tools/testing/selftests/breakpoints/breakpoint_test.c @@ -0,0 +1,394 @@ +/* + * Copyright (C) 2011 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com> + * + * Licensed under the terms of the GNU GPL License version 2 + * + * Selftests for breakpoints (and more generally the do_debug() path) in x86. + */ + + +#include <sys/ptrace.h> +#include <unistd.h> +#include <stddef.h> +#include <sys/user.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <sys/types.h> +#include <sys/wait.h> + + +/* Breakpoint access modes */ +enum { + BP_X = 1, + BP_RW = 2, + BP_W = 4, +}; + +static pid_t child_pid; + +/* + * Ensures the child and parent are always "talking" about + * the same test sequence. (ie: that we haven't forgotten + * to call check_trapped() somewhere). + */ +static int nr_tests; + +static void set_breakpoint_addr(void *addr, int n) +{ + int ret; + + ret = ptrace(PTRACE_POKEUSER, child_pid, + offsetof(struct user, u_debugreg[n]), addr); + if (ret) { + perror("Can't set breakpoint addr\n"); + exit(-1); + } +} + +static void toggle_breakpoint(int n, int type, int len, + int local, int global, int set) +{ + int ret; + + int xtype, xlen; + unsigned long vdr7, dr7; + + switch (type) { + case BP_X: + xtype = 0; + break; + case BP_W: + xtype = 1; + break; + case BP_RW: + xtype = 3; + break; + } + + switch (len) { + case 1: + xlen = 0; + break; + case 2: + xlen = 4; + break; + case 4: + xlen = 0xc; + break; + case 8: + xlen = 8; + break; + } + + dr7 = ptrace(PTRACE_PEEKUSER, child_pid, + offsetof(struct user, u_debugreg[7]), 0); + + vdr7 = (xlen | xtype) << 16; + vdr7 <<= 4 * n; + + if (local) { + vdr7 |= 1 << (2 * n); + vdr7 |= 1 << 8; + } + if (global) { + vdr7 |= 2 << (2 * n); + vdr7 |= 1 << 9; + } + + if (set) + dr7 |= vdr7; + else + dr7 &= ~vdr7; + + ret = ptrace(PTRACE_POKEUSER, child_pid, + offsetof(struct user, u_debugreg[7]), dr7); + if (ret) { + perror("Can't set dr7"); + exit(-1); + } +} + +/* Dummy variables to test read/write accesses */ +static unsigned long long dummy_var[4]; + +/* Dummy functions to test execution accesses */ +static void dummy_func(void) { } +static void dummy_func1(void) { } +static void dummy_func2(void) { } +static void dummy_func3(void) { } + +static void (*dummy_funcs[])(void) = { + dummy_func, + dummy_func1, + dummy_func2, + dummy_func3, +}; + +static int trapped; + +static void check_trapped(void) +{ + /* + * If we haven't trapped, wake up the parent + * so that it notices the failure. + */ + if (!trapped) + kill(getpid(), SIGUSR1); + trapped = 0; + + nr_tests++; +} + +static void write_var(int len) +{ + char *pcval; short *psval; int *pival; long long *plval; + int i; + + for (i = 0; i < 4; i++) { + switch (len) { + case 1: + pcval = (char *)&dummy_var[i]; + *pcval = 0xff; + break; + case 2: + psval = (short *)&dummy_var[i]; + *psval = 0xffff; + break; + case 4: + pival = (int *)&dummy_var[i]; + *pival = 0xffffffff; + break; + case 8: + plval = (long long *)&dummy_var[i]; + *plval = 0xffffffffffffffffLL; + break; + } + check_trapped(); + } +} + +static void read_var(int len) +{ + char cval; short sval; int ival; long long lval; + int i; + + for (i = 0; i < 4; i++) { + switch (len) { + case 1: + cval = *(char *)&dummy_var[i]; + break; + case 2: + sval = *(short *)&dummy_var[i]; + break; + case 4: + ival = *(int *)&dummy_var[i]; + break; + case 8: + lval = *(long long *)&dummy_var[i]; + break; + } + check_trapped(); + } +} + +/* + * Do the r/w/x accesses to trigger the breakpoints. And run + * the usual traps. + */ +static void trigger_tests(void) +{ + int len, local, global, i; + char val; + int ret; + + ret = ptrace(PTRACE_TRACEME, 0, NULL, 0); + if (ret) { + perror("Can't be traced?\n"); + return; + } + + /* Wake up father so that it sets up the first test */ + kill(getpid(), SIGUSR1); + + /* Test instruction breakpoints */ + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + + for (i = 0; i < 4; i++) { + dummy_funcs[i](); + check_trapped(); + } + } + } + + /* Test write watchpoints */ + for (len = 1; len <= sizeof(long); len <<= 1) { + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + write_var(len); + } + } + } + + /* Test read/write watchpoints (on read accesses) */ + for (len = 1; len <= sizeof(long); len <<= 1) { + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + read_var(len); + } + } + } + + /* Icebp trap */ + asm(".byte 0xf1\n"); + check_trapped(); + + /* Int 3 trap */ + asm("int $3\n"); + check_trapped(); + + kill(getpid(), SIGUSR1); +} + +static void check_success(const char *msg) +{ + const char *msg2; + int child_nr_tests; + int status; + + /* Wait for the child to SIGTRAP */ + wait(&status); + + msg2 = "Failed"; + + if (WSTOPSIG(status) == SIGTRAP) { + child_nr_tests = ptrace(PTRACE_PEEKDATA, child_pid, + &nr_tests, 0); + if (child_nr_tests == nr_tests) + msg2 = "Ok"; + if (ptrace(PTRACE_POKEDATA, child_pid, &trapped, 1)) { + perror("Can't poke\n"); + exit(-1); + } + } + + nr_tests++; + + printf("%s [%s]\n", msg, msg2); +} + +static void launch_instruction_breakpoints(char *buf, int local, int global) +{ + int i; + + for (i = 0; i < 4; i++) { + set_breakpoint_addr(dummy_funcs[i], i); + toggle_breakpoint(i, BP_X, 1, local, global, 1); + ptrace(PTRACE_CONT, child_pid, NULL, 0); + sprintf(buf, "Test breakpoint %d with local: %d global: %d", + i, local, global); + check_success(buf); + toggle_breakpoint(i, BP_X, 1, local, global, 0); + } +} + +static void launch_watchpoints(char *buf, int mode, int len, + int local, int global) +{ + const char *mode_str; + int i; + + if (mode == BP_W) + mode_str = "write"; + else + mode_str = "read"; + + for (i = 0; i < 4; i++) { + set_breakpoint_addr(&dummy_var[i], i); + toggle_breakpoint(i, mode, len, local, global, 1); + ptrace(PTRACE_CONT, child_pid, NULL, 0); + sprintf(buf, "Test %s watchpoint %d with len: %d local: " + "%d global: %d", mode_str, i, len, local, global); + check_success(buf); + toggle_breakpoint(i, mode, len, local, global, 0); + } +} + +/* Set the breakpoints and check the child successfully trigger them */ +static void launch_tests(void) +{ + char buf[1024]; + int len, local, global, i; + + /* Instruction breakpoints */ + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + launch_instruction_breakpoints(buf, local, global); + } + } + + /* Write watchpoint */ + for (len = 1; len <= sizeof(long); len <<= 1) { + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + launch_watchpoints(buf, BP_W, len, + local, global); + } + } + } + + /* Read-Write watchpoint */ + for (len = 1; len <= sizeof(long); len <<= 1) { + for (local = 0; local < 2; local++) { + for (global = 0; global < 2; global++) { + if (!local && !global) + continue; + launch_watchpoints(buf, BP_RW, len, + local, global); + } + } + } + + /* Icebp traps */ + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success("Test icebp"); + + /* Int 3 traps */ + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success("Test int 3 trap"); + + ptrace(PTRACE_CONT, child_pid, NULL, 0); +} + +int main(int argc, char **argv) +{ + pid_t pid; + int ret; + + pid = fork(); + if (!pid) { + trigger_tests(); + return 0; + } + + child_pid = pid; + + wait(NULL); + + launch_tests(); + + wait(NULL); + + return 0; +} diff --git a/tools/testing/selftests/run_tests b/tools/testing/selftests/run_tests new file mode 100644 index 0000000..320718a --- /dev/null +++ b/tools/testing/selftests/run_tests @@ -0,0 +1,8 @@ +#!/bin/bash + +TARGETS=breakpoints + +for TARGET in $TARGETS +do + $TARGET/run_test +done |