From 58d114b669d2b86aa79eac6688590c808072579b Mon Sep 17 00:00:00 2001 From: "Ying-Chun Liu (PaulLiu)" Date: Mon, 16 Apr 2012 00:01:50 +0800 Subject: mfd: Add device-tree support for da9502 i2c driver This patch adds device-tree support for dialog MFD and the binding documentations. Signed-off-by: Ying-Chun Liu (PaulLiu) Cc: Samuel Ortiz Cc: Mark Brown Cc: Shawn Guo Cc: Ashish Jangam Signed-off-by: Samuel Ortiz --- .../devicetree/bindings/mfd/da9052-i2c.txt | 60 ++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 Documentation/devicetree/bindings/mfd/da9052-i2c.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mfd/da9052-i2c.txt b/Documentation/devicetree/bindings/mfd/da9052-i2c.txt new file mode 100644 index 0000000..1857f4a --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/da9052-i2c.txt @@ -0,0 +1,60 @@ +* Dialog DA9052/53 Power Management Integrated Circuit (PMIC) + +Required properties: +- compatible : Should be "dlg,da9052", "dlg,da9053-aa", + "dlg,da9053-ab", or "dlg,da9053-bb" + +Sub-nodes: +- regulators : Contain the regulator nodes. The DA9052/53 regulators are + bound using their names as listed below: + + buck0 : regulator BUCK0 + buck1 : regulator BUCK1 + buck2 : regulator BUCK2 + buck3 : regulator BUCK3 + ldo4 : regulator LDO4 + ldo5 : regulator LDO5 + ldo6 : regulator LDO6 + ldo7 : regulator LDO7 + ldo8 : regulator LDO8 + ldo9 : regulator LDO9 + ldo10 : regulator LDO10 + ldo11 : regulator LDO11 + ldo12 : regulator LDO12 + ldo13 : regulator LDO13 + + The bindings details of individual regulator device can be found in: + Documentation/devicetree/bindings/regulator/regulator.txt + +Examples: + +i2c@63fc8000 { /* I2C1 */ + status = "okay"; + + pmic: dialog@48 { + compatible = "dlg,da9053-aa"; + reg = <0x48>; + + regulators { + buck0 { + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <2075000>; + }; + + buck1 { + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <2075000>; + }; + + buck2 { + regulator-min-microvolt = <925000>; + regulator-max-microvolt = <2500000>; + }; + + buck3 { + regulator-min-microvolt = <925000>; + regulator-max-microvolt = <2500000>; + }; + }; + }; +}; -- cgit v1.1 From 16c5c023aac86228e3e94c4bf6d19708ea861a05 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 3 May 2012 12:26:36 +0200 Subject: mfd: Add LM3533 lighting-power core driver Add support for National Semiconductor / TI LM3533 lighting power chips. This is the core driver which provides register access over I2C and registers the ambient-light-sensor, LED and backlight sub-drivers. Signed-off-by: Johan Hovold Reviewed-by: Mark Brown Signed-off-by: Samuel Ortiz --- .../ABI/testing/sysfs-bus-i2c-devices-lm3533 | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533 (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533 b/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533 new file mode 100644 index 0000000..5700721 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533 @@ -0,0 +1,38 @@ +What: /sys/bus/i2c/devices/.../boost_freq +Date: April 2012 +KernelVersion: 3.5 +Contact: Johan Hovold +Description: + Set the boost converter switching frequency (0, 1), where + + 0 - 500Hz + 1 - 1000Hz + +What: /sys/bus/i2c/devices/.../boost_ovp +Date: April 2012 +KernelVersion: 3.5 +Contact: Johan Hovold +Description: + Set the boost converter over-voltage protection threshold + (0..3), where + + 0 - 16V + 1 - 24V + 2 - 32V + 3 - 40V + +What: /sys/bus/i2c/devices/.../output_hvled[n] +Date: April 2012 +KernelVersion: 3.5 +Contact: Johan Hovold +Description: + Set the controlling backlight device for high-voltage current + sink HVLED[n] (n = 1, 2) (0, 1). + +What: /sys/bus/i2c/devices/.../output_lvled[n] +Date: April 2012 +KernelVersion: 3.5 +Contact: Johan Hovold +Description: + Set the controlling led device for low-voltage current sink + LVLED[n] (n = 1..5) (0..3). -- cgit v1.1 From 1291aa457324e149bbda19bd637b4b8ec8f04bcb Mon Sep 17 00:00:00 2001 From: Rhyland Klein Date: Tue, 8 May 2012 11:42:39 -0700 Subject: mfd: Add tps65910 device tree bindings documentation Add device tree bindings for TI's tps65910 pmic. Signed-off-by: Rhyland Klein Signed-off-by: Samuel Ortiz --- Documentation/devicetree/bindings/mfd/tps65910.txt | 133 +++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 Documentation/devicetree/bindings/mfd/tps65910.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mfd/tps65910.txt b/Documentation/devicetree/bindings/mfd/tps65910.txt new file mode 100644 index 0000000..645f5ea --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/tps65910.txt @@ -0,0 +1,133 @@ +TPS65910 Power Management Integrated Circuit + +Required properties: +- compatible: "ti,tps65910" or "ti,tps65911" +- reg: I2C slave address +- interrupts: the interrupt outputs of the controller +- #gpio-cells: number of cells to describe a GPIO, this should be 2. + The first cell is the GPIO number. + The second cell is used to specify additional options . +- gpio-controller: mark the device as a GPIO controller +- #interrupt-cells: the number of cells to describe an IRQ, this should be 2. + The first cell is the IRQ number. + The second cell is the flags, encoded as the trigger masks from + Documentation/devicetree/bindings/interrupts.txt +- regulators: This is the list of child nodes that specify the regulator + initialization data for defined regulators. Not all regulators for the given + device need to be present. The definition for each of these nodes is defined + using the standard binding for regulators found at + Documentation/devicetree/bindings/regulator/regulator.txt. + + The valid names for regulators are: + tps65910: vrtc, vio, vdd1, vdd2, vdd3, vdig1, vdig2, vpll, vdac, vaux1, + vaux2, vaux33, vmmc + tps65911: vrtc, vio, vdd1, vdd3, vddctrl, ldo1, ldo2, ldo3, ldo4, ldo5, + ldo6, ldo7, ldo8 + +Optional properties: +- ti,vmbch-threshold: (tps65911) main battery charged threshold + comparator. (see VMBCH_VSEL in TPS65910 datasheet) +- ti,vmbch2-threshold: (tps65911) main battery discharged threshold + comparator. (see VMBCH_VSEL in TPS65910 datasheet) +- ti,en-gpio-sleep: enable sleep control for gpios + There should be 9 entries here, one for each gpio. + +Regulator Optional properties: +- ti,regulator-ext-sleep-control: enable external sleep + control through external inputs [0 (not enabled), 1 (EN1), 2 (EN2) or 4(EN3)] + If this property is not defined, it defaults to 0 (not enabled). + +Example: + + pmu: tps65910@d2 { + compatible = "ti,tps65910"; + reg = <0xd2>; + interrupt-parent = <&intc>; + interrupts = < 0 118 0x04 >; + + #gpio-cells = <2>; + gpio-controller; + + #interrupt-cells = <2>; + interrupt-controller; + + ti,vmbch-threshold = 0; + ti,vmbch2-threshold = 0; + + ti,en-gpio-sleep = <0 0 1 0 0 0 0 0 0>; + + regulators { + vdd1_reg: vdd1 { + regulator-min-microvolt = < 600000>; + regulator-max-microvolt = <1500000>; + regulator-always-on; + regulator-boot-on; + ti,regulator-ext-sleep-control = <0>; + }; + vdd2_reg: vdd2 { + regulator-min-microvolt = < 600000>; + regulator-max-microvolt = <1500000>; + regulator-always-on; + regulator-boot-on; + ti,regulator-ext-sleep-control = <4>; + }; + vddctrl_reg: vddctrl { + regulator-min-microvolt = < 600000>; + regulator-max-microvolt = <1400000>; + regulator-always-on; + regulator-boot-on; + ti,regulator-ext-sleep-control = <0>; + }; + vio_reg: vio { + regulator-min-microvolt = <1500000>; + regulator-max-microvolt = <1800000>; + regulator-always-on; + regulator-boot-on; + ti,regulator-ext-sleep-control = <1>; + }; + ldo1_reg: ldo1 { + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <3300000>; + ti,regulator-ext-sleep-control = <0>; + }; + ldo2_reg: ldo2 { + regulator-min-microvolt = <1050000>; + regulator-max-microvolt = <1050000>; + ti,regulator-ext-sleep-control = <0>; + }; + ldo3_reg: ldo3 { + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <3300000>; + ti,regulator-ext-sleep-control = <0>; + }; + ldo4_reg: ldo4 { + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + ti,regulator-ext-sleep-control = <0>; + }; + ldo5_reg: ldo5 { + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <3300000>; + ti,regulator-ext-sleep-control = <0>; + }; + ldo6_reg: ldo6 { + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + ti,regulator-ext-sleep-control = <0>; + }; + ldo7_reg: ldo7 { + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + regulator-always-on; + regulator-boot-on; + ti,regulator-ext-sleep-control = <1>; + }; + ldo8_reg: ldo8 { + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + ti,regulator-ext-sleep-control = <1>; + }; + }; + }; -- cgit v1.1 From f4cf18ca5914bc1edb03c9049198738255fa4a23 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 10 May 2012 14:11:29 +0200 Subject: mfd: Remove lm3533 boost attributes Remove boost-frequency and ovp attributes, which can be set through platform data, from sysfs. Signed-off-by: Johan Hovold Signed-off-by: Samuel Ortiz --- .../ABI/testing/sysfs-bus-i2c-devices-lm3533 | 23 ---------------------- 1 file changed, 23 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533 b/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533 index 5700721..1b62230 100644 --- a/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533 +++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533 @@ -1,26 +1,3 @@ -What: /sys/bus/i2c/devices/.../boost_freq -Date: April 2012 -KernelVersion: 3.5 -Contact: Johan Hovold -Description: - Set the boost converter switching frequency (0, 1), where - - 0 - 500Hz - 1 - 1000Hz - -What: /sys/bus/i2c/devices/.../boost_ovp -Date: April 2012 -KernelVersion: 3.5 -Contact: Johan Hovold -Description: - Set the boost converter over-voltage protection threshold - (0..3), where - - 0 - 16V - 1 - 24V - 2 - 32V - 3 - 40V - What: /sys/bus/i2c/devices/.../output_hvled[n] Date: April 2012 KernelVersion: 3.5 -- cgit v1.1 From 37e13cecaa141eccce705843f5d2f7509e29bd3a Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 16 May 2012 14:11:58 +0300 Subject: mfd: Add support for Device Tree to twl6040 Device tree based probing support for the core twl6040 driver. Child devices will be created as MFD devices: - ASoC codec is always created - Vibra child is only created if the vibra section present in the DT blob. Signed-off-by: Peter Ujfalusi Signed-off-by: Samuel Ortiz --- Documentation/devicetree/bindings/mfd/twl6040.txt | 62 +++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 Documentation/devicetree/bindings/mfd/twl6040.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mfd/twl6040.txt b/Documentation/devicetree/bindings/mfd/twl6040.txt new file mode 100644 index 0000000..bc67c6f --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/twl6040.txt @@ -0,0 +1,62 @@ +Texas Instruments TWL6040 family + +The TWL6040s are 8-channel high quality low-power audio codecs providing audio +and vibra functionality on OMAP4+ platforms. +They are connected ot the host processor via i2c for commands, McPDM for audio +data and commands. + +Required properties: +- compatible : Must be "ti,twl6040"; +- reg: must be 0x4b for i2c address +- interrupts: twl6040 has one interrupt line connecteded to the main SoC +- interrupt-parent: The parent interrupt controller +- twl6040,audpwron-gpio: Power on GPIO line for the twl6040 + +- vio-supply: Regulator for the twl6040 VIO supply +- v2v1-supply: Regulator for the twl6040 V2V1 supply + +Optional properties, nodes: +- enable-active-high: To power on the twl6040 during boot. + +Vibra functionality +Required properties: +- vddvibl-supply: Regulator for the left vibra motor +- vddvibr-supply: Regulator for the right vibra motor +- vibra { }: Configuration section for vibra parameters containing the following + properties: +- ti,vibldrv-res: Resistance parameter for left driver +- ti,vibrdrv-res: Resistance parameter for right driver +- ti,viblmotor-res: Resistance parameter for left motor +- ti,viblmotor-res: Resistance parameter for right motor + +Optional properties within vibra { } section: +- vddvibl_uV: If the vddvibl default voltage need to be changed +- vddvibr_uV: If the vddvibr default voltage need to be changed + +Example: +&i2c1 { + twl6040: twl@4b { + compatible = "ti,twl6040"; + reg = <0x4b>; + + interrupts = <0 119 4>; + interrupt-parent = <&gic>; + twl6040,audpwron-gpio = <&gpio4 31 0>; + + vio-supply = <&v1v8>; + v2v1-supply = <&v2v1>; + enable-active-high; + + /* regulators for vibra motor */ + vddvibl-supply = <&vbat>; + vddvibr-supply = <&vbat>; + + vibra { + /* Vibra driver, motor resistance parameters */ + ti,vibldrv-res = <8>; + ti,vibrdrv-res = <3>; + ti,viblmotor-res = <10>; + ti,vibrmotor-res = <10>; + }; + }; +}; -- cgit v1.1 From eb6332a54542bcd3aedb121ecb247f172b8f3602 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:26 -0700 Subject: Documentation: memcg: future proof hierarchical statistics documentation The hierarchical versions of per-memcg counters in memory.stat are all calculated the same way and are all named total_. Documenting the pattern is easier for maintenance than listing each counter twice. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: KOSAKI Motohiro Acked-by: Ying Han Randy Dunlap Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 9b1067a..e479007 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -430,17 +430,10 @@ hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to hierarchy under which memory cgroup is. -total_cache - sum of all children's "cache" -total_rss - sum of all children's "rss" -total_mapped_file - sum of all children's "cache" -total_pgpgin - sum of all children's "pgpgin" -total_pgpgout - sum of all children's "pgpgout" -total_swap - sum of all children's "swap" -total_inactive_anon - sum of all children's "inactive_anon" -total_active_anon - sum of all children's "active_anon" -total_inactive_file - sum of all children's "inactive_file" -total_active_file - sum of all children's "active_file" -total_unevictable - sum of all children's "unevictable" +total_ - # hierarchical version of , which in + addition to the cgroup's own value includes the + sum of all hierarchical children's values of + , i.e. total_cache # The following additional stats are dependent on CONFIG_DEBUG_VM. -- cgit v1.1 From 17cf28afea2a1112f240a3a2da8af883be024811 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:41 -0700 Subject: mm/fs: remove truncate_range Remove vmtruncate_range(), and remove the truncate_range method from struct inode_operations: only tmpfs ever supported it, and tmpfs has now converted over to using the fallocate method of file_operations. Update Documentation accordingly, adding (setlease and) fallocate lines. And while we're in mm.h, remove duplicate declarations of shmem_lock() and shmem_file_setup(): everyone is now using the ones in shmem_fs.h. Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 -- Documentation/filesystems/vfs.txt | 13 ++++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4fca82e..d449e63 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -60,7 +60,6 @@ ata *); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); - void (*truncate_range)(struct inode *, loff_t, loff_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); locking rules: @@ -87,7 +86,6 @@ setxattr: yes getxattr: no listxattr: no removexattr: yes -truncate_range: yes fiemap: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 0d04920..ef19f91 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -363,7 +363,6 @@ struct inode_operations { ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); - void (*truncate_range)(struct inode *, loff_t, loff_t); }; Again, all methods are called without any locks being held, unless @@ -472,9 +471,6 @@ otherwise noted. removexattr: called by the VFS to remove an extended attribute from a file. This method is called by removexattr(2) system call. - truncate_range: a method provided by the underlying filesystem to truncate a - range of blocks , i.e. punch a hole somewhere in a file. - The Address Space Object ======================== @@ -760,7 +756,7 @@ struct file_operations ---------------------- This describes how the VFS can manipulate an open file. As of kernel -2.6.22, the following members are defined: +3.5, the following members are defined: struct file_operations { struct module *owner; @@ -790,6 +786,8 @@ struct file_operations { int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int); + int (*setlease)(struct file *, long arg, struct file_lock **); + long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len); }; Again, all methods are called without any locks being held, unless @@ -858,6 +856,11 @@ otherwise noted. splice_read: called by the VFS to splice data from file to a pipe. This method is used by the splice(2) system call + setlease: called by the VFS to set or release a file lock lease. + setlease has the file_lock_lock held and must not sleep. + + fallocate: called by the VFS to preallocate blocks or punch a hole. + Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special -- cgit v1.1 From 692569946fbf56fbb75d85c57679541f9a3550b4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 29 May 2012 15:06:45 -0700 Subject: mm: document the meminfo and vmstat fields of relevance to transparent hugepages Update Documentation/vm/transhuge.txt and Documentation/filesystems/proc.txt with some information on monitoring transparent huge page usage and the associated overhead. Signed-off-by: Mel Gorman Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 2 ++ Documentation/vm/transhuge.txt | 62 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) (limited to 'Documentation') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ef088e5..912af6c 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -743,6 +743,7 @@ Committed_AS: 100056 kB VmallocTotal: 112216 kB VmallocUsed: 428 kB VmallocChunk: 111088 kB +AnonHugePages: 49152 kB MemTotal: Total usable ram (i.e. physical ram minus a few reserved bits and the kernel binary code) @@ -776,6 +777,7 @@ VmallocChunk: 111088 kB Dirty: Memory which is waiting to get written back to the disk Writeback: Memory which is actively being written back to the disk AnonPages: Non-file backed pages mapped into userspace page tables +AnonHugePages: Non-file backed huge pages mapped into userspace page tables Mapped: files which have been mmaped, such as libraries Slab: in-kernel data structures cache SReclaimable: Part of Slab, that might be reclaimed, such as caches diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 29bdf62..f734bb2 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -166,6 +166,68 @@ behavior. So to make them effective you need to restart any application that could have been using hugepages. This also applies to the regions registered in khugepaged. +== Monitoring usage == + +The number of transparent huge pages currently used by the system is +available by reading the AnonHugePages field in /proc/meminfo. To +identify what applications are using transparent huge pages, it is +necessary to read /proc/PID/smaps and count the AnonHugePages fields +for each mapping. Note that reading the smaps file is expensive and +reading it frequently will incur overhead. + +There are a number of counters in /proc/vmstat that may be used to +monitor how successfully the system is providing huge pages for use. + +thp_fault_alloc is incremented every time a huge page is successfully + allocated to handle a page fault. This applies to both the + first time a page is faulted and for COW faults. + +thp_collapse_alloc is incremented by khugepaged when it has found + a range of pages to collapse into one huge page and has + successfully allocated a new huge page to store the data. + +thp_fault_fallback is incremented if a page fault fails to allocate + a huge page and instead falls back to using small pages. + +thp_collapse_alloc_failed is incremented if khugepaged found a range + of pages that should be collapsed into one huge page but failed + the allocation. + +thp_split is incremented every time a huge page is split into base + pages. This can happen for a variety of reasons but a common + reason is that a huge page is old and is being reclaimed. + +As the system ages, allocating huge pages may be expensive as the +system uses memory compaction to copy data around memory to free a +huge page for use. There are some counters in /proc/vmstat to help +monitor this overhead. + +compact_stall is incremented every time a process stalls to run + memory compaction so that a huge page is free for use. + +compact_success is incremented if the system compacted memory and + freed a huge page for use. + +compact_fail is incremented if the system tries to compact memory + but failed. + +compact_pages_moved is incremented each time a page is moved. If + this value is increasing rapidly, it implies that the system + is copying a lot of data to satisfy the huge page allocation. + It is possible that the cost of copying exceeds any savings + from reduced TLB misses. + +compact_pagemigrate_failed is incremented when the underlying mechanism + for moving a page failed. + +compact_blocks_moved is incremented each time memory compaction examines + a huge page aligned range of pages. + +It is possible to establish how long the stalls were using the function +tracer to record how long was spent in __alloc_pages_nodemask and +using the mm_page_alloc tracepoint to identify which allocations were +for huge pages. + == get_user_pages and follow_page == get_user_pages and follow_page if run on a hugepage, will return the -- cgit v1.1 From 4b91355e9dc9ac1eb3d69e56de093899ff2677ef Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 29 May 2012 15:06:51 -0700 Subject: memcg: fix/change behavior of shared anon at moving task This patch changes memcg's behavior at task_move(). At task_move(), the kernel scans a task's page table and move the changes for mapped pages from source cgroup to target cgroup. There has been a bug at handling shared anonymous pages for a long time. Before patch: - The spec says 'shared anonymous pages are not moved.' - The implementation was 'shared anonymoys pages may be moved'. If page_mapcount <=2, shared anonymous pages's charge were moved. After patch: - The spec says 'all anonymous pages are moved'. - The implementation is 'all anonymous pages are moved'. Considering usage of memcg, this will not affect user's experience. 'shared anonymous' pages only exists between a tree of processes which don't do exec(). Moving one of process without exec() seems not sane. For example, libcgroup will not be affected by this change. (Anyway, no one noticed the implementation for a long time...) Below is a discussion log: - current spec/implementation are complex - Now, shared file caches are moved - It adds unclear check as page_mapcount(). To do correct check, we should check swap users, etc. - No one notice this implementation behavior. So, no one get benefit from the design. - In general, once task is moved to a cgroup for running, it will not be moved.... - Finally, we have control knob as memory.move_charge_at_immigrate. Here is a patch to allow moving shared pages, completely. This makes memcg simpler and fix current broken code. Suggested-by: Hugh Dickins Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Naoya Horiguchi Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index e479007..6a066a2 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -184,12 +184,14 @@ behind this approach is that a cgroup that aggressively uses a shared page will eventually get charged for it (once it is uncharged from the cgroup that brought it in -- this will happen on memory pressure). +But see section 8.2: when moving a task to another cgroup, its pages may +be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. + Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used. When you do swapoff and make swapped-out pages of shmem(tmpfs) to be backed into memory in force, charges for pages are accounted against the caller of swapoff rather than the users of shmem. - 2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) Swap Extension allows you to record charge for swap. A swapped-in page is @@ -615,8 +617,7 @@ memory cgroup. bit | what type of charges would be moved ? -----+------------------------------------------------------------------------ 0 | A charge of an anonymous page(or swap of it) used by the target task. - | Those pages and swaps must be used only by the target task. You must - | enable Swap Extension(see 2.4) to enable move of swap charges. + | You must enable Swap Extension(see 2.4) to enable move of swap charges. -----+------------------------------------------------------------------------ 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory) | and swaps of tmpfs file) mmapped by the target task. Unlike the case of @@ -629,8 +630,6 @@ memory cgroup. 8.3 TODO -- Implement madvise(2) to let users decide the vma to be moved or not to be - moved. - All of moving charge operations are done under cgroup_mutex. It's not good behavior to hold the mutex too long, so we may need some trick. -- cgit v1.1