summaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/DocBook/mtdnand.tmpl2
-rw-r--r--Documentation/DocBook/scsi.tmpl2
-rw-r--r--Documentation/SubmittingPatches2
-rw-r--r--Documentation/filesystems/nfs41-server.txt54
-rw-r--r--Documentation/filesystems/nfsroot.txt2
-rw-r--r--Documentation/filesystems/proc.txt22
-rw-r--r--Documentation/gcov.txt2
-rw-r--r--Documentation/hwmon/hpfall.c115
-rw-r--r--Documentation/hwmon/pc874272
-rw-r--r--Documentation/kernel-parameters.txt2
-rw-r--r--Documentation/kmemcheck.txt21
-rw-r--r--Documentation/memory.txt31
-rw-r--r--Documentation/networking/regulatory.txt2
-rw-r--r--Documentation/numastat.txt8
-rw-r--r--Documentation/powerpc/dts-bindings/marvell.txt2
-rw-r--r--Documentation/scsi/ChangeLog.megaraid2
-rw-r--r--Documentation/scsi/scsi_fc_transport.txt2
-rw-r--r--Documentation/sound/alsa/HD-Audio-Models.txt2
-rw-r--r--Documentation/sysctl/kernel.txt30
-rw-r--r--Documentation/sysctl/vm.txt4
-rw-r--r--Documentation/trace/events-kmem.txt107
-rw-r--r--Documentation/trace/events.txt2
-rw-r--r--Documentation/trace/ftrace.txt2
-rw-r--r--Documentation/trace/postprocess/trace-pagealloc-postprocess.pl418
-rw-r--r--Documentation/trace/tracepoint-analysis.txt327
-rw-r--r--Documentation/vm/00-INDEX4
-rw-r--r--Documentation/vm/hugetlbpage.txt147
-rw-r--r--Documentation/vm/ksm.txt89
-rw-r--r--Documentation/vm/map_hugetlb.c77
29 files changed, 1310 insertions, 172 deletions
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
index 8e14585..df0d089 100644
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -568,7 +568,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
<para>
The blocks in which the tables are stored are procteted against
accidental access by marking them bad in the memory bad block
- table. The bad block table managment functions are allowed
+ table. The bad block table management functions are allowed
to circumvernt this protection.
</para>
<para>
diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl
index 10a150a..d87f456 100644
--- a/Documentation/DocBook/scsi.tmpl
+++ b/Documentation/DocBook/scsi.tmpl
@@ -317,7 +317,7 @@
<para>
The SAS transport class contains common code to deal with SAS HBAs,
an aproximated representation of SAS topologies in the driver model,
- and various sysfs attributes to expose these topologies and managment
+ and various sysfs attributes to expose these topologies and management
interfaces to userspace.
</para>
<para>
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index 5c555a8b..b7f9d3b 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -183,7 +183,7 @@ the MAN-PAGES maintainer (as listed in the MAINTAINERS file)
a man-pages patch, or at least a notification of the change,
so that some information makes its way into the manual pages.
-Even if the maintainer did not respond in step #4, make sure to ALWAYS
+Even if the maintainer did not respond in step #5, make sure to ALWAYS
copy the maintainer when you change their code.
For small patches you may want to CC the Trivial Patch Monkey
diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs41-server.txt
index 05d81cb..5920fe2 100644
--- a/Documentation/filesystems/nfs41-server.txt
+++ b/Documentation/filesystems/nfs41-server.txt
@@ -11,6 +11,11 @@ the /proc/fs/nfsd/versions control file. Note that to write this
control file, the nfsd service must be taken down. Use your user-mode
nfs-utils to set this up; see rpc.nfsd(8)
+(Warning: older servers will interpret "+4.1" and "-4.1" as "+4" and
+"-4", respectively. Therefore, code meant to work on both new and old
+kernels must turn 4.1 on or off *before* turning support for version 4
+on or off; rpc.nfsd does this correctly.)
+
The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
on the latest NFSv4.1 Internet Draft:
http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
@@ -25,6 +30,49 @@ are still under development out of tree.
See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design
for more information.
+The current implementation is intended for developers only: while it
+does support ordinary file operations on clients we have tested against
+(including the linux client), it is incomplete in ways which may limit
+features unexpectedly, cause known bugs in rare cases, or cause
+interoperability problems with future clients. Known issues:
+
+ - gss support is questionable: currently mounts with kerberos
+ from a linux client are possible, but we aren't really
+ conformant with the spec (for example, we don't use kerberos
+ on the backchannel correctly).
+ - no trunking support: no clients currently take advantage of
+ trunking, but this is a mandatory failure, and its use is
+ recommended to clients in a number of places. (E.g. to ensure
+ timely renewal in case an existing connection's retry timeouts
+ have gotten too long; see section 8.3 of the draft.)
+ Therefore, lack of this feature may cause future clients to
+ fail.
+ - Incomplete backchannel support: incomplete backchannel gss
+ support and no support for BACKCHANNEL_CTL mean that
+ callbacks (hence delegations and layouts) may not be
+ available and clients confused by the incomplete
+ implementation may fail.
+ - Server reboot recovery is unsupported; if the server reboots,
+ clients may fail.
+ - We do not support SSV, which provides security for shared
+ client-server state (thus preventing unauthorized tampering
+ with locks and opens, for example). It is mandatory for
+ servers to support this, though no clients use it yet.
+ - Mandatory operations which we do not support, such as
+ DESTROY_CLIENTID, FREE_STATEID, SECINFO_NO_NAME, and
+ TEST_STATEID, are not currently used by clients, but will be
+ (and the spec recommends their uses in common cases), and
+ clients should not be expected to know how to recover from the
+ case where they are not supported. This will eventually cause
+ interoperability failures.
+
+In addition, some limitations are inherited from the current NFSv4
+implementation:
+
+ - Incomplete delegation enforcement: if a file is renamed or
+ unlinked, a client holding a delegation may continue to
+ indefinitely allow opens of the file under the old name.
+
The table below, taken from the NFSv4.1 document, lists
the operations that are mandatory to implement (REQ), optional
(OPT), and NFSv4.0 operations that are required not to implement (MNI)
@@ -142,6 +190,12 @@ NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 |
Implementation notes:
+DELEGPURGE:
+* mandatory only for servers that support CLAIM_DELEGATE_PREV and/or
+ CLAIM_DELEG_PREV_FH (which allows clients to keep delegations that
+ persist across client reboots). Thus we need not implement this for
+ now.
+
EXCHANGE_ID:
* only SP4_NONE state protection supported
* implementation ids are ignored
diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfsroot.txt
index 68baddf..3ba0b94 100644
--- a/Documentation/filesystems/nfsroot.txt
+++ b/Documentation/filesystems/nfsroot.txt
@@ -105,7 +105,7 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>
the client address and this parameter is NOT empty only
replies from the specified server are accepted.
- Only required for for NFS root. That is autoconfiguration
+ Only required for NFS root. That is autoconfiguration
will not be triggered if it is missing and NFS root is not
in operation.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index ffead13..75988ba 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -375,6 +375,19 @@ of memory currently marked as referenced or accessed.
This file is only present if the CONFIG_MMU kernel configuration option is
enabled.
+The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
+bits on both physical and virtual pages associated with a process.
+To clear the bits for all the pages associated with the process
+ > echo 1 > /proc/PID/clear_refs
+
+To clear the bits for the anonymous pages associated with the process
+ > echo 2 > /proc/PID/clear_refs
+
+To clear the bits for the file mapped pages associated with the process
+ > echo 3 > /proc/PID/clear_refs
+Any other value written to /proc/PID/clear_refs will have no effect.
+
+
1.2 Kernel data
---------------
@@ -1032,9 +1045,9 @@ Various pieces of information about kernel activity are available in the
since the system first booted. For a quick look, simply cat the file:
> cat /proc/stat
- cpu 2255 34 2290 22625563 6290 127 456 0
- cpu0 1132 34 1441 11311718 3675 127 438 0
- cpu1 1123 0 849 11313845 2614 0 18 0
+ cpu 2255 34 2290 22625563 6290 127 456 0 0
+ cpu0 1132 34 1441 11311718 3675 127 438 0 0
+ cpu1 1123 0 849 11313845 2614 0 18 0 0
intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...]
ctxt 1990473
btime 1062191376
@@ -1056,6 +1069,7 @@ second). The meanings of the columns are as follows, from left to right:
- irq: servicing interrupts
- softirq: servicing softirqs
- steal: involuntary wait
+- guest: running a guest
The "intr" line gives counts of interrupts serviced since boot time, for each
of the possible system interrupts. The first column is the total of all
@@ -1191,7 +1205,7 @@ The following heuristics are then applied:
* if the task was reniced, its score doubles
* superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE
or CAP_SYS_RAWIO) have their score divided by 4
- * if oom condition happened in one cpuset and checked task does not belong
+ * if oom condition happened in one cpuset and checked process does not belong
to it, its score is divided by 8
* the resulting score is multiplied by two to the power of oom_adj, i.e.
points <<= oom_adj when it is positive and
diff --git a/Documentation/gcov.txt b/Documentation/gcov.txt
index 40ec633..e7ca647 100644
--- a/Documentation/gcov.txt
+++ b/Documentation/gcov.txt
@@ -47,7 +47,7 @@ Possible uses:
Configure the kernel with:
- CONFIG_DEBUGFS=y
+ CONFIG_DEBUG_FS=y
CONFIG_GCOV_KERNEL=y
and to get coverage data for the entire kernel:
diff --git a/Documentation/hwmon/hpfall.c b/Documentation/hwmon/hpfall.c
index bbea1cc..681ec22 100644
--- a/Documentation/hwmon/hpfall.c
+++ b/Documentation/hwmon/hpfall.c
@@ -16,6 +16,34 @@
#include <stdint.h>
#include <errno.h>
#include <signal.h>
+#include <sys/mman.h>
+#include <sched.h>
+
+char unload_heads_path[64];
+
+int set_unload_heads_path(char *device)
+{
+ char devname[64];
+
+ if (strlen(device) <= 5 || strncmp(device, "/dev/", 5) != 0)
+ return -EINVAL;
+ strncpy(devname, device + 5, sizeof(devname));
+
+ snprintf(unload_heads_path, sizeof(unload_heads_path),
+ "/sys/block/%s/device/unload_heads", devname);
+ return 0;
+}
+int valid_disk(void)
+{
+ int fd = open(unload_heads_path, O_RDONLY);
+ if (fd < 0) {
+ perror(unload_heads_path);
+ return 0;
+ }
+
+ close(fd);
+ return 1;
+}
void write_int(char *path, int i)
{
@@ -40,7 +68,7 @@ void set_led(int on)
void protect(int seconds)
{
- write_int("/sys/block/sda/device/unload_heads", seconds*1000);
+ write_int(unload_heads_path, seconds*1000);
}
int on_ac(void)
@@ -57,45 +85,62 @@ void ignore_me(void)
{
protect(0);
set_led(0);
-
}
-int main(int argc, char* argv[])
+int main(int argc, char **argv)
{
- int fd, ret;
+ int fd, ret;
+ struct sched_param param;
+
+ if (argc == 1)
+ ret = set_unload_heads_path("/dev/sda");
+ else if (argc == 2)
+ ret = set_unload_heads_path(argv[1]);
+ else
+ ret = -EINVAL;
+
+ if (ret || !valid_disk()) {
+ fprintf(stderr, "usage: %s <device> (default: /dev/sda)\n",
+ argv[0]);
+ exit(1);
+ }
+
+ fd = open("/dev/freefall", O_RDONLY);
+ if (fd < 0) {
+ perror("/dev/freefall");
+ return EXIT_FAILURE;
+ }
- fd = open("/dev/freefall", O_RDONLY);
- if (fd < 0) {
- perror("open");
- return EXIT_FAILURE;
- }
+ daemon(0, 0);
+ param.sched_priority = sched_get_priority_max(SCHED_FIFO);
+ sched_setscheduler(0, SCHED_FIFO, &param);
+ mlockall(MCL_CURRENT|MCL_FUTURE);
signal(SIGALRM, ignore_me);
- for (;;) {
- unsigned char count;
-
- ret = read(fd, &count, sizeof(count));
- alarm(0);
- if ((ret == -1) && (errno == EINTR)) {
- /* Alarm expired, time to unpark the heads */
- continue;
- }
-
- if (ret != sizeof(count)) {
- perror("read");
- break;
- }
-
- protect(21);
- set_led(1);
- if (1 || on_ac() || lid_open()) {
- alarm(2);
- } else {
- alarm(20);
- }
- }
-
- close(fd);
- return EXIT_SUCCESS;
+ for (;;) {
+ unsigned char count;
+
+ ret = read(fd, &count, sizeof(count));
+ alarm(0);
+ if ((ret == -1) && (errno == EINTR)) {
+ /* Alarm expired, time to unpark the heads */
+ continue;
+ }
+
+ if (ret != sizeof(count)) {
+ perror("read");
+ break;
+ }
+
+ protect(21);
+ set_led(1);
+ if (1 || on_ac() || lid_open())
+ alarm(2);
+ else
+ alarm(20);
+ }
+
+ close(fd);
+ return EXIT_SUCCESS;
}
diff --git a/Documentation/hwmon/pc87427 b/Documentation/hwmon/pc87427
index d1ebbe5..db5cc12 100644
--- a/Documentation/hwmon/pc87427
+++ b/Documentation/hwmon/pc87427
@@ -34,5 +34,5 @@ Fan rotation speeds are reported as 14-bit values from a gated clock
signal. Speeds down to 83 RPM can be measured.
An alarm is triggered if the rotation speed drops below a programmable
-limit. Another alarm is triggered if the speed is too low to to be measured
+limit. Another alarm is triggered if the speed is too low to be measured
(including stalled or missing fan).
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 0f17d16..c363840 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -933,7 +933,7 @@ and is between 256 and 4096 characters. It is defined in the file
1 -- enable informational integrity auditing messages.
ima_hash= [IMA]
- Formt: { "sha1" | "md5" }
+ Format: { "sha1" | "md5" }
default: "sha1"
ima_tcb [IMA]
diff --git a/Documentation/kmemcheck.txt b/Documentation/kmemcheck.txt
index 36304460..c28f828 100644
--- a/Documentation/kmemcheck.txt
+++ b/Documentation/kmemcheck.txt
@@ -43,26 +43,7 @@ feature.
1. Downloading
==============
-kmemcheck can only be downloaded using git. If you want to write patches
-against the current code, you should use the kmemcheck development branch of
-the tip tree. It is also possible to use the linux-next tree, which also
-includes the latest version of kmemcheck.
-
-Assuming that you've already cloned the linux-2.6.git repository, all you
-have to do is add the -tip tree as a remote, like this:
-
- $ git remote add tip git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git
-
-To actually download the tree, fetch the remote:
-
- $ git fetch tip
-
-And to check out a new local branch with the kmemcheck code:
-
- $ git checkout -b kmemcheck tip/kmemcheck
-
-General instructions for the -tip tree can be found here:
-http://people.redhat.com/mingo/tip.git/readme.txt
+As of version 2.6.31-rc1, kmemcheck is included in the mainline kernel.
2. Configuring and compiling
diff --git a/Documentation/memory.txt b/Documentation/memory.txt
index 2b3dedd..802efe5 100644
--- a/Documentation/memory.txt
+++ b/Documentation/memory.txt
@@ -1,18 +1,7 @@
There are several classic problems related to memory on Linux
systems.
- 1) There are some buggy motherboards which cannot properly
- deal with the memory above 16MB. Consider exchanging
- your motherboard.
-
- 2) You cannot do DMA on the ISA bus to addresses above
- 16M. Most device drivers under Linux allow the use
- of bounce buffers which work around this problem. Drivers
- that don't use bounce buffers will be unstable with
- more than 16M installed. Drivers that use bounce buffers
- will be OK, but may have slightly higher overhead.
-
- 3) There are some motherboards that will not cache above
+ 1) There are some motherboards that will not cache above
a certain quantity of memory. If you have one of these
motherboards, your system will be SLOWER, not faster
as you add more memory. Consider exchanging your
@@ -24,7 +13,7 @@ It can also tell Linux to use less memory than is actually installed.
If you use "mem=" on a machine with PCI, consider using "memmap=" to avoid
physical address space collisions.
-See the documentation of your boot loader (LILO, loadlin, etc.) about
+See the documentation of your boot loader (LILO, grub, loadlin, etc.) about
how to pass options to the kernel.
There are other memory problems which Linux cannot deal with. Random
@@ -42,19 +31,3 @@ Try:
with the vendor. Consider testing it with memtest86 yourself.
* Exchanging your CPU, cache, or motherboard for one that works.
-
- * Disabling the cache from the BIOS.
-
- * Try passing the "mem=4M" option to the kernel to limit
- Linux to using a very small amount of memory. Use "memmap="-option
- together with "mem=" on systems with PCI to avoid physical address
- space collisions.
-
-
-Other tricks:
-
- * Try passing the "no-387" option to the kernel to ignore
- a buggy FPU.
-
- * Try passing the "no-hlt" option to disable the potentially
- buggy HLT instruction in your CPU.
diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt
index eaa1a25..ee31369 100644
--- a/Documentation/networking/regulatory.txt
+++ b/Documentation/networking/regulatory.txt
@@ -96,7 +96,7 @@ Example code - drivers hinting an alpha2:
This example comes from the zd1211rw device driver. You can start
by having a mapping of your device's EEPROM country/regulatory
-domain value to to a specific alpha2 as follows:
+domain value to a specific alpha2 as follows:
static struct zd_reg_alpha2_map reg_alpha2_map[] = {
{ ZD_REGDOMAIN_FCC, "US" },
diff --git a/Documentation/numastat.txt b/Documentation/numastat.txt
index 80133ac..9fcc9a6 100644
--- a/Documentation/numastat.txt
+++ b/Documentation/numastat.txt
@@ -7,10 +7,10 @@ All units are pages. Hugepages have separate counters.
numa_hit A process wanted to allocate memory from this node,
and succeeded.
-numa_miss A process wanted to allocate memory from this node,
- but ended up with memory from another.
-numa_foreign A process wanted to allocate on another node,
- but ended up with memory from this one.
+numa_miss A process wanted to allocate memory from another node,
+ but ended up with memory from this node.
+numa_foreign A process wanted to allocate on this node,
+ but ended up with memory from another one.
local_node A process ran on this node and got memory from it.
other_node A process ran on this node and got memory from another node.
interleave_hit Interleaving wanted to allocate from this node
diff --git a/Documentation/powerpc/dts-bindings/marvell.txt b/Documentation/powerpc/dts-bindings/marvell.txt
index 3708a2f..f1533d9 100644
--- a/Documentation/powerpc/dts-bindings/marvell.txt
+++ b/Documentation/powerpc/dts-bindings/marvell.txt
@@ -32,7 +32,7 @@ prefixed with the string "marvell,", for Marvell Technology Group Ltd.
devices. This field represents the number of cells needed to
represent the address of the memory-mapped registers of devices
within the system controller chip.
- - #size-cells : Size representation for for the memory-mapped
+ - #size-cells : Size representation for the memory-mapped
registers within the system controller chip.
- #interrupt-cells : Defines the width of cells used to represent
interrupts.
diff --git a/Documentation/scsi/ChangeLog.megaraid b/Documentation/scsi/ChangeLog.megaraid
index eaa4801..38e9e7c 100644
--- a/Documentation/scsi/ChangeLog.megaraid
+++ b/Documentation/scsi/ChangeLog.megaraid
@@ -514,7 +514,7 @@ iv. Remove yield() while mailbox handshake in synchronous commands
v. Remove redundant __megaraid_busywait_mbox routine
-vi. Fix bug in the managment module, which causes a system lockup when the
+vi. Fix bug in the management module, which causes a system lockup when the
IO module is loaded and then unloaded, followed by executing any
management utility. The current version of management module does not
handle the adapter unregister properly.
diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt
index d7f1817..aec6549 100644
--- a/Documentation/scsi/scsi_fc_transport.txt
+++ b/Documentation/scsi/scsi_fc_transport.txt
@@ -378,7 +378,7 @@ Vport Disable/Enable:
int vport_disable(struct fc_vport *vport, bool disable)
where:
- vport: Is vport to to be enabled or disabled
+ vport: Is vport to be enabled or disabled
disable: If "true", the vport is to be disabled.
If "false", the vport is to be enabled.
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt
index 97eebd6..f1708b7 100644
--- a/Documentation/sound/alsa/HD-Audio-Models.txt
+++ b/Documentation/sound/alsa/HD-Audio-Models.txt
@@ -387,7 +387,7 @@ STAC92HD73*
STAC92HD83*
===========
ref Reference board
- mic-ref Reference board with power managment for ports
+ mic-ref Reference board with power management for ports
dell-s14 Dell laptop
auto BIOS setup (default)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 2dbff53..3e5b63e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -319,25 +319,29 @@ This option can be used to select the type of process address
space randomization that is used in the system, for architectures
that support this feature.
-0 - Turn the process address space randomization off by default.
+0 - Turn the process address space randomization off. This is the
+ default for architectures that do not support this feature anyways,
+ and kernels that are booted with the "norandmaps" parameter.
1 - Make the addresses of mmap base, stack and VDSO page randomized.
This, among other things, implies that shared libraries will be
- loaded to random addresses. Also for PIE-linked binaries, the location
- of code start is randomized.
+ loaded to random addresses. Also for PIE-linked binaries, the
+ location of code start is randomized. This is the default if the
+ CONFIG_COMPAT_BRK option is enabled.
- With heap randomization, the situation is a little bit more
- complicated.
- There a few legacy applications out there (such as some ancient
+2 - Additionally enable heap randomization. This is the default if
+ CONFIG_COMPAT_BRK is disabled.
+
+ There are a few legacy applications out there (such as some ancient
versions of libc.so.5 from 1996) that assume that brk area starts
- just after the end of the code+bss. These applications break when
- start of the brk area is randomized. There are however no known
+ just after the end of the code+bss. These applications break when
+ start of the brk area is randomized. There are however no known
non-legacy applications that would be broken this way, so for most
- systems it is safe to choose full randomization. However there is
- a CONFIG_COMPAT_BRK option for systems with ancient and/or broken
- binaries, that makes heap non-randomized, but keeps all other
- parts of process address space randomized if randomize_va_space
- sysctl is turned on.
+ systems it is safe to choose full randomization.
+
+ Systems with ancient and/or broken binaries should be configured
+ with CONFIG_COMPAT_BRK enabled, which excludes the heap from process
+ address space randomization.
==============================================================
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index c4de635..e6fb1ec 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -585,7 +585,9 @@ caching of directory and inode objects.
At the default value of vfs_cache_pressure=100 the kernel will attempt to
reclaim dentries and inodes at a "fair" rate with respect to pagecache and
swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer
-to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100
+to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will
+never reclaim dentries and inodes due to memory pressure and this can easily
+lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
causes the kernel to prefer to reclaim dentries and inodes.
==============================================================
diff --git a/Documentation/trace/events-kmem.txt b/Documentation/trace/events-kmem.txt
new file mode 100644
index 0000000..6ef2a86
--- /dev/null
+++ b/Documentation/trace/events-kmem.txt
@@ -0,0 +1,107 @@
+ Subsystem Trace Points: kmem
+
+The tracing system kmem captures events related to object and page allocation
+within the kernel. Broadly speaking there are four major subheadings.
+
+ o Slab allocation of small objects of unknown type (kmalloc)
+ o Slab allocation of small objects of known type
+ o Page allocation
+ o Per-CPU Allocator Activity
+ o External Fragmentation
+
+This document will describe what each of the tracepoints are and why they
+might be useful.
+
+1. Slab allocation of small objects of unknown type
+===================================================
+kmalloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s
+kmalloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d
+kfree call_site=%lx ptr=%p
+
+Heavy activity for these events may indicate that a specific cache is
+justified, particularly if kmalloc slab pages are getting significantly
+internal fragmented as a result of the allocation pattern. By correlating
+kmalloc with kfree, it may be possible to identify memory leaks and where
+the allocation sites were.
+
+
+2. Slab allocation of small objects of known type
+=================================================
+kmem_cache_alloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s
+kmem_cache_alloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d
+kmem_cache_free call_site=%lx ptr=%p
+
+These events are similar in usage to the kmalloc-related events except that
+it is likely easier to pin the event down to a specific cache. At the time
+of writing, no information is available on what slab is being allocated from,
+but the call_site can usually be used to extrapolate that information
+
+3. Page allocation
+==================
+mm_page_alloc page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s
+mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d
+mm_page_free_direct page=%p pfn=%lu order=%d
+mm_pagevec_free page=%p pfn=%lu order=%d cold=%d
+
+These four events deal with page allocation and freeing. mm_page_alloc is
+a simple indicator of page allocator activity. Pages may be allocated from
+the per-CPU allocator (high performance) or the buddy allocator.
+
+If pages are allocated directly from the buddy allocator, the
+mm_page_alloc_zone_locked event is triggered. This event is important as high
+amounts of activity imply high activity on the zone->lock. Taking this lock
+impairs performance by disabling interrupts, dirtying cache lines between
+CPUs and serialising many CPUs.
+
+When a page is freed directly by the caller, the mm_page_free_direct event
+is triggered. Significant amounts of activity here could indicate that the
+callers should be batching their activities.
+
+When pages are freed using a pagevec, the mm_pagevec_free is
+triggered. Broadly speaking, pages are taken off the LRU lock in bulk and
+freed in batch with a pagevec. Significant amounts of activity here could
+indicate that the system is under memory pressure and can also indicate
+contention on the zone->lru_lock.
+
+4. Per-CPU Allocator Activity
+=============================
+mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d
+mm_page_pcpu_drain page=%p pfn=%lu order=%d cpu=%d migratetype=%d
+
+In front of the page allocator is a per-cpu page allocator. It exists only
+for order-0 pages, reduces contention on the zone->lock and reduces the
+amount of writing on struct page.
+
+When a per-CPU list is empty or pages of the wrong type are allocated,
+the zone->lock will be taken once and the per-CPU list refilled. The event
+triggered is mm_page_alloc_zone_locked for each page allocated with the
+event indicating whether it is for a percpu_refill or not.
+
+When the per-CPU list is too full, a number of pages are freed, each one
+which triggers a mm_page_pcpu_drain event.
+
+The individual nature of the events are so that pages can be tracked
+between allocation and freeing. A number of drain or refill pages that occur
+consecutively imply the zone->lock being taken once. Large amounts of PCP
+refills and drains could imply an imbalance between CPUs where too much work
+is being concentrated in one place. It could also indicate that the per-CPU
+lists should be a larger size. Finally, large amounts of refills on one CPU
+and drains on another could be a factor in causing large amounts of cache
+line bounces due to writes between CPUs and worth investigating if pages
+can be allocated and freed on the same CPU through some algorithm change.
+
+5. External Fragmentation
+=========================
+mm_page_alloc_extfrag page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d
+
+External fragmentation affects whether a high-order allocation will be
+successful or not. For some types of hardware, this is important although
+it is avoided where possible. If the system is using huge pages and needs
+to be able to resize the pool over the lifetime of the system, this value
+is important.
+
+Large numbers of this event implies that memory is fragmenting and
+high-order allocations will start failing at some time in the future. One
+means of reducing the occurange of this event is to increase the size of
+min_free_kbytes in increments of 3*pageblock_size*nr_online_nodes where
+pageblock_size is usually the size of the default hugepage size.
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 78c45a8..02ac6ed 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -72,7 +72,7 @@ To enable all events in sched subsystem:
# echo 1 > /sys/kernel/debug/tracing/events/sched/enable
-To eanble all events:
+To enable all events:
# echo 1 > /sys/kernel/debug/tracing/events/enable
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 1b6292b..957b22f 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -133,7 +133,7 @@ of ftrace. Here is a list of some of the key files:
than requested, the rest of the page will be used,
making the actual allocation bigger than requested.
( Note, the size may not be a multiple of the page size
- due to buffer managment overhead. )
+ due to buffer management overhead. )
This can only be updated when the current_tracer
is set to "nop".
diff --git a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl
new file mode 100644
index 0000000..7df50e8
--- /dev/null
+++ b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl
@@ -0,0 +1,418 @@
+#!/usr/bin/perl
+# This is a POC (proof of concept or piece of crap, take your pick) for reading the
+# text representation of trace output related to page allocation. It makes an attempt
+# to extract some high-level information on what is going on. The accuracy of the parser
+# may vary considerably
+#
+# Example usage: trace-pagealloc-postprocess.pl < /sys/kernel/debug/tracing/trace_pipe
+# other options
+# --prepend-parent Report on the parent proc and PID
+# --read-procstat If the trace lacks process info, get it from /proc
+# --ignore-pid Aggregate processes of the same name together
+#
+# Copyright (c) IBM Corporation 2009
+# Author: Mel Gorman <mel@csn.ul.ie>
+use strict;
+use Getopt::Long;
+
+# Tracepoint events
+use constant MM_PAGE_ALLOC => 1;
+use constant MM_PAGE_FREE_DIRECT => 2;
+use constant MM_PAGEVEC_FREE => 3;
+use constant MM_PAGE_PCPU_DRAIN => 4;
+use constant MM_PAGE_ALLOC_ZONE_LOCKED => 5;
+use constant MM_PAGE_ALLOC_EXTFRAG => 6;
+use constant EVENT_UNKNOWN => 7;
+
+# Constants used to track state
+use constant STATE_PCPU_PAGES_DRAINED => 8;
+use constant STATE_PCPU_PAGES_REFILLED => 9;
+
+# High-level events extrapolated from tracepoints
+use constant HIGH_PCPU_DRAINS => 10;
+use constant HIGH_PCPU_REFILLS => 11;
+use constant HIGH_EXT_FRAGMENT => 12;
+use constant HIGH_EXT_FRAGMENT_SEVERE => 13;
+use constant HIGH_EXT_FRAGMENT_MODERATE => 14;
+use constant HIGH_EXT_FRAGMENT_CHANGED => 15;
+
+my %perprocesspid;
+my %perprocess;
+my $opt_ignorepid;
+my $opt_read_procstat;
+my $opt_prepend_parent;
+
+# Catch sigint and exit on request
+my $sigint_report = 0;
+my $sigint_exit = 0;
+my $sigint_pending = 0;
+my $sigint_received = 0;
+sub sigint_handler {
+ my $current_time = time;
+ if ($current_time - 2 > $sigint_received) {
+ print "SIGINT received, report pending. Hit ctrl-c again to exit\n";
+ $sigint_report = 1;
+ } else {
+ if (!$sigint_exit) {
+ print "Second SIGINT received quickly, exiting\n";
+ }
+ $sigint_exit++;
+ }
+
+ if ($sigint_exit > 3) {
+ print "Many SIGINTs received, exiting now without report\n";
+ exit;
+ }
+
+ $sigint_received = $current_time;
+ $sigint_pending = 1;
+}
+$SIG{INT} = "sigint_handler";
+
+# Parse command line options
+GetOptions(
+ 'ignore-pid' => \$opt_ignorepid,
+ 'read-procstat' => \$opt_read_procstat,
+ 'prepend-parent' => \$opt_prepend_parent,
+);
+
+# Defaults for dynamically discovered regex's
+my $regex_fragdetails_default = 'page=([0-9a-f]*) pfn=([0-9]*) alloc_order=([-0-9]*) fallback_order=([-0-9]*) pageblock_order=([-0-9]*) alloc_migratetype=([-0-9]*) fallback_migratetype=([-0-9]*) fragmenting=([-0-9]) change_ownership=([-0-9])';
+
+# Dyanically discovered regex
+my $regex_fragdetails;
+
+# Static regex used. Specified like this for readability and for use with /o
+# (process_pid) (cpus ) ( time ) (tpoint ) (details)
+my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)';
+my $regex_statname = '[-0-9]*\s\((.*)\).*';
+my $regex_statppid = '[-0-9]*\s\(.*\)\s[A-Za-z]\s([0-9]*).*';
+
+sub generate_traceevent_regex {
+ my $event = shift;
+ my $default = shift;
+ my $regex;
+
+ # Read the event format or use the default
+ if (!open (FORMAT, "/sys/kernel/debug/tracing/events/$event/format")) {
+ $regex = $default;
+ } else {
+ my $line;
+ while (!eof(FORMAT)) {
+ $line = <FORMAT>;
+ if ($line =~ /^print fmt:\s"(.*)",.*/) {
+ $regex = $1;
+ $regex =~ s/%p/\([0-9a-f]*\)/g;
+ $regex =~ s/%d/\([-0-9]*\)/g;
+ $regex =~ s/%lu/\([0-9]*\)/g;
+ }
+ }
+ }
+
+ # Verify fields are in the right order
+ my $tuple;
+ foreach $tuple (split /\s/, $regex) {
+ my ($key, $value) = split(/=/, $tuple);
+ my $expected = shift;
+ if ($key ne $expected) {
+ print("WARNING: Format not as expected '$key' != '$expected'");
+ $regex =~ s/$key=\((.*)\)/$key=$1/;
+ }
+ }
+
+ if (defined shift) {
+ die("Fewer fields than expected in format");
+ }
+
+ return $regex;
+}
+$regex_fragdetails = generate_traceevent_regex("kmem/mm_page_alloc_extfrag",
+ $regex_fragdetails_default,
+ "page", "pfn",
+ "alloc_order", "fallback_order", "pageblock_order",
+ "alloc_migratetype", "fallback_migratetype",
+ "fragmenting", "change_ownership");
+
+sub read_statline($) {
+ my $pid = $_[0];
+ my $statline;
+
+ if (open(STAT, "/proc/$pid/stat")) {
+ $statline = <STAT>;
+ close(STAT);
+ }
+
+ if ($statline eq '') {
+ $statline = "-1 (UNKNOWN_PROCESS_NAME) R 0";
+ }
+
+ return $statline;
+}
+
+sub guess_process_pid($$) {
+ my $pid = $_[0];
+ my $statline = $_[1];
+
+ if ($pid == 0) {
+ return "swapper-0";
+ }
+
+ if ($statline !~ /$regex_statname/o) {
+ die("Failed to math stat line for process name :: $statline");
+ }
+ return "$1-$pid";
+}
+
+sub parent_info($$) {
+ my $pid = $_[0];
+ my $statline = $_[1];
+ my $ppid;
+
+ if ($pid == 0) {
+ return "NOPARENT-0";
+ }
+
+ if ($statline !~ /$regex_statppid/o) {
+ die("Failed to match stat line process ppid:: $statline");
+ }
+
+ # Read the ppid stat line
+ $ppid = $1;
+ return guess_process_pid($ppid, read_statline($ppid));
+}
+
+sub process_events {
+ my $traceevent;
+ my $process_pid;
+ my $cpus;
+ my $timestamp;
+ my $tracepoint;
+ my $details;
+ my $statline;
+
+ # Read each line of the event log
+EVENT_PROCESS:
+ while ($traceevent = <STDIN>) {
+ if ($traceevent =~ /$regex_traceevent/o) {
+ $process_pid = $1;
+ $tracepoint = $4;
+
+ if ($opt_read_procstat || $opt_prepend_parent) {
+ $process_pid =~ /(.*)-([0-9]*)$/;
+ my $process = $1;
+ my $pid = $2;
+
+ $statline = read_statline($pid);
+
+ if ($opt_read_procstat && $process eq '') {
+ $process_pid = guess_process_pid($pid, $statline);
+ }
+
+ if ($opt_prepend_parent) {
+ $process_pid = parent_info($pid, $statline) . " :: $process_pid";
+ }
+ }
+
+ # Unnecessary in this script. Uncomment if required
+ # $cpus = $2;
+ # $timestamp = $3;
+ } else {
+ next;
+ }
+
+ # Perl Switch() sucks majorly
+ if ($tracepoint eq "mm_page_alloc") {
+ $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}++;
+ } elsif ($tracepoint eq "mm_page_free_direct") {
+ $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}++;
+ } elsif ($tracepoint eq "mm_pagevec_free") {
+ $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}++;
+ } elsif ($tracepoint eq "mm_page_pcpu_drain") {
+ $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}++;
+ $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED}++;
+ } elsif ($tracepoint eq "mm_page_alloc_zone_locked") {
+ $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}++;
+ $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED}++;
+ } elsif ($tracepoint eq "mm_page_alloc_extfrag") {
+
+ # Extract the details of the event now
+ $details = $5;
+
+ my ($page, $pfn);
+ my ($alloc_order, $fallback_order, $pageblock_order);
+ my ($alloc_migratetype, $fallback_migratetype);
+ my ($fragmenting, $change_ownership);
+
+ if ($details !~ /$regex_fragdetails/o) {
+ print "WARNING: Failed to parse mm_page_alloc_extfrag as expected\n";
+ next;
+ }
+
+ $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}++;
+ $page = $1;
+ $pfn = $2;
+ $alloc_order = $3;
+ $fallback_order = $4;
+ $pageblock_order = $5;
+ $alloc_migratetype = $6;
+ $fallback_migratetype = $7;
+ $fragmenting = $8;
+ $change_ownership = $9;
+
+ if ($fragmenting) {
+ $perprocesspid{$process_pid}->{HIGH_EXT_FRAG}++;
+ if ($fallback_order <= 3) {
+ $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}++;
+ } else {
+ $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}++;
+ }
+ }
+ if ($change_ownership) {
+ $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}++;
+ }
+ } else {
+ $perprocesspid{$process_pid}->{EVENT_UNKNOWN}++;
+ }
+
+ # Catch a full pcpu drain event
+ if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} &&
+ $tracepoint ne "mm_page_pcpu_drain") {
+
+ $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}++;
+ $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0;
+ }
+
+ # Catch a full pcpu refill event
+ if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} &&
+ $tracepoint ne "mm_page_alloc_zone_locked") {
+ $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}++;
+ $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0;
+ }
+
+ if ($sigint_pending) {
+ last EVENT_PROCESS;
+ }
+ }
+}
+
+sub dump_stats {
+ my $hashref = shift;
+ my %stats = %$hashref;
+
+ # Dump per-process stats
+ my $process_pid;
+ my $max_strlen = 0;
+
+ # Get the maximum process name
+ foreach $process_pid (keys %perprocesspid) {
+ my $len = length($process_pid);
+ if ($len > $max_strlen) {
+ $max_strlen = $len;
+ }
+ }
+ $max_strlen += 2;
+
+ printf("\n");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n",
+ "Process", "Pages", "Pages", "Pages", "Pages", "PCPU", "PCPU", "PCPU", "Fragment", "Fragment", "MigType", "Fragment", "Fragment", "Unknown");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n",
+ "details", "allocd", "allocd", "freed", "freed", "pages", "drains", "refills", "Fallback", "Causing", "Changed", "Severe", "Moderate", "");
+
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n",
+ "", "", "under lock", "direct", "pagevec", "drain", "", "", "", "", "", "", "", "");
+
+ foreach $process_pid (keys %stats) {
+ # Dump final aggregates
+ if ($stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED}) {
+ $stats{$process_pid}->{HIGH_PCPU_DRAINS}++;
+ $stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0;
+ }
+ if ($stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED}) {
+ $stats{$process_pid}->{HIGH_PCPU_REFILLS}++;
+ $stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0;
+ }
+
+ printf("%-" . $max_strlen . "s %8d %10d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d\n",
+ $process_pid,
+ $stats{$process_pid}->{MM_PAGE_ALLOC},
+ $stats{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED},
+ $stats{$process_pid}->{MM_PAGE_FREE_DIRECT},
+ $stats{$process_pid}->{MM_PAGEVEC_FREE},
+ $stats{$process_pid}->{MM_PAGE_PCPU_DRAIN},
+ $stats{$process_pid}->{HIGH_PCPU_DRAINS},
+ $stats{$process_pid}->{HIGH_PCPU_REFILLS},
+ $stats{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG},
+ $stats{$process_pid}->{HIGH_EXT_FRAG},
+ $stats{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED},
+ $stats{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE},
+ $stats{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE},
+ $stats{$process_pid}->{EVENT_UNKNOWN});
+ }
+}
+
+sub aggregate_perprocesspid() {
+ my $process_pid;
+ my $process;
+ undef %perprocess;
+
+ foreach $process_pid (keys %perprocesspid) {
+ $process = $process_pid;
+ $process =~ s/-([0-9])*$//;
+ if ($process eq '') {
+ $process = "NO_PROCESS_NAME";
+ }
+
+ $perprocess{$process}->{MM_PAGE_ALLOC} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC};
+ $perprocess{$process}->{MM_PAGE_ALLOC_ZONE_LOCKED} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED};
+ $perprocess{$process}->{MM_PAGE_FREE_DIRECT} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT};
+ $perprocess{$process}->{MM_PAGEVEC_FREE} += $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE};
+ $perprocess{$process}->{MM_PAGE_PCPU_DRAIN} += $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN};
+ $perprocess{$process}->{HIGH_PCPU_DRAINS} += $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS};
+ $perprocess{$process}->{HIGH_PCPU_REFILLS} += $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS};
+ $perprocess{$process}->{MM_PAGE_ALLOC_EXTFRAG} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG};
+ $perprocess{$process}->{HIGH_EXT_FRAG} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAG};
+ $perprocess{$process}->{HIGH_EXT_FRAGMENT_CHANGED} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED};
+ $perprocess{$process}->{HIGH_EXT_FRAGMENT_SEVERE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE};
+ $perprocess{$process}->{HIGH_EXT_FRAGMENT_MODERATE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE};
+ $perprocess{$process}->{EVENT_UNKNOWN} += $perprocesspid{$process_pid}->{EVENT_UNKNOWN};
+ }
+}
+
+sub report() {
+ if (!$opt_ignorepid) {
+ dump_stats(\%perprocesspid);
+ } else {
+ aggregate_perprocesspid();
+ dump_stats(\%perprocess);
+ }
+}
+
+# Process events or signals until neither is available
+sub signal_loop() {
+ my $sigint_processed;
+ do {
+ $sigint_processed = 0;
+ process_events();
+
+ # Handle pending signals if any
+ if ($sigint_pending) {
+ my $current_time = time;
+
+ if ($sigint_exit) {
+ print "Received exit signal\n";
+ $sigint_pending = 0;
+ }
+ if ($sigint_report) {
+ if ($current_time >= $sigint_received + 2) {
+ report();
+ $sigint_report = 0;
+ $sigint_pending = 0;
+ $sigint_processed = 1;
+ }
+ }
+ }
+ } while ($sigint_pending || $sigint_processed);
+}
+
+signal_loop();
+report();
diff --git a/Documentation/trace/tracepoint-analysis.txt b/Documentation/trace/tracepoint-analysis.txt
new file mode 100644
index 0000000..5eb4e487
--- /dev/null
+++ b/Documentation/trace/tracepoint-analysis.txt
@@ -0,0 +1,327 @@
+ Notes on Analysing Behaviour Using Events and Tracepoints
+
+ Documentation written by Mel Gorman
+ PCL information heavily based on email from Ingo Molnar
+
+1. Introduction
+===============
+
+Tracepoints (see Documentation/trace/tracepoints.txt) can be used without
+creating custom kernel modules to register probe functions using the event
+tracing infrastructure.
+
+Simplistically, tracepoints will represent an important event that when can
+be taken in conjunction with other tracepoints to build a "Big Picture" of
+what is going on within the system. There are a large number of methods for
+gathering and interpreting these events. Lacking any current Best Practises,
+this document describes some of the methods that can be used.
+
+This document assumes that debugfs is mounted on /sys/kernel/debug and that
+the appropriate tracing options have been configured into the kernel. It is
+assumed that the PCL tool tools/perf has been installed and is in your path.
+
+2. Listing Available Events
+===========================
+
+2.1 Standard Utilities
+----------------------
+
+All possible events are visible from /sys/kernel/debug/tracing/events. Simply
+calling
+
+ $ find /sys/kernel/debug/tracing/events -type d
+
+will give a fair indication of the number of events available.
+
+2.2 PCL
+-------
+
+Discovery and enumeration of all counters and events, including tracepoints
+are available with the perf tool. Getting a list of available events is a
+simple case of
+
+ $ perf list 2>&1 | grep Tracepoint
+ ext4:ext4_free_inode [Tracepoint event]
+ ext4:ext4_request_inode [Tracepoint event]
+ ext4:ext4_allocate_inode [Tracepoint event]
+ ext4:ext4_write_begin [Tracepoint event]
+ ext4:ext4_ordered_write_end [Tracepoint event]
+ [ .... remaining output snipped .... ]
+
+
+2. Enabling Events
+==================
+
+2.1 System-Wide Event Enabling
+------------------------------
+
+See Documentation/trace/events.txt for a proper description on how events
+can be enabled system-wide. A short example of enabling all events related
+to page allocation would look something like
+
+ $ for i in `find /sys/kernel/debug/tracing/events -name "enable" | grep mm_`; do echo 1 > $i; done
+
+2.2 System-Wide Event Enabling with SystemTap
+---------------------------------------------
+
+In SystemTap, tracepoints are accessible using the kernel.trace() function
+call. The following is an example that reports every 5 seconds what processes
+were allocating the pages.
+
+ global page_allocs
+
+ probe kernel.trace("mm_page_alloc") {
+ page_allocs[execname()]++
+ }
+
+ function print_count() {
+ printf ("%-25s %-s\n", "#Pages Allocated", "Process Name")
+ foreach (proc in page_allocs-)
+ printf("%-25d %s\n", page_allocs[proc], proc)
+ printf ("\n")
+ delete page_allocs
+ }
+
+ probe timer.s(5) {
+ print_count()
+ }
+
+2.3 System-Wide Event Enabling with PCL
+---------------------------------------
+
+By specifying the -a switch and analysing sleep, the system-wide events
+for a duration of time can be examined.
+
+ $ perf stat -a \
+ -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
+ -e kmem:mm_pagevec_free \
+ sleep 10
+ Performance counter stats for 'sleep 10':
+
+ 9630 kmem:mm_page_alloc
+ 2143 kmem:mm_page_free_direct
+ 7424 kmem:mm_pagevec_free
+
+ 10.002577764 seconds time elapsed
+
+Similarly, one could execute a shell and exit it as desired to get a report
+at that point.
+
+2.4 Local Event Enabling
+------------------------
+
+Documentation/trace/ftrace.txt describes how to enable events on a per-thread
+basis using set_ftrace_pid.
+
+2.5 Local Event Enablement with PCL
+-----------------------------------
+
+Events can be activate and tracked for the duration of a process on a local
+basis using PCL such as follows.
+
+ $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
+ -e kmem:mm_pagevec_free ./hackbench 10
+ Time: 0.909
+
+ Performance counter stats for './hackbench 10':
+
+ 17803 kmem:mm_page_alloc
+ 12398 kmem:mm_page_free_direct
+ 4827 kmem:mm_pagevec_free
+
+ 0.973913387 seconds time elapsed
+
+3. Event Filtering
+==================
+
+Documentation/trace/ftrace.txt covers in-depth how to filter events in
+ftrace. Obviously using grep and awk of trace_pipe is an option as well
+as any script reading trace_pipe.
+
+4. Analysing Event Variances with PCL
+=====================================
+
+Any workload can exhibit variances between runs and it can be important
+to know what the standard deviation in. By and large, this is left to the
+performance analyst to do it by hand. In the event that the discrete event
+occurrences are useful to the performance analyst, then perf can be used.
+
+ $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct
+ -e kmem:mm_pagevec_free ./hackbench 10
+ Time: 0.890
+ Time: 0.895
+ Time: 0.915
+ Time: 1.001
+ Time: 0.899
+
+ Performance counter stats for './hackbench 10' (5 runs):
+
+ 16630 kmem:mm_page_alloc ( +- 3.542% )
+ 11486 kmem:mm_page_free_direct ( +- 4.771% )
+ 4730 kmem:mm_pagevec_free ( +- 2.325% )
+
+ 0.982653002 seconds time elapsed ( +- 1.448% )
+
+In the event that some higher-level event is required that depends on some
+aggregation of discrete events, then a script would need to be developed.
+
+Using --repeat, it is also possible to view how events are fluctuating over
+time on a system wide basis using -a and sleep.
+
+ $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
+ -e kmem:mm_pagevec_free \
+ -a --repeat 10 \
+ sleep 1
+ Performance counter stats for 'sleep 1' (10 runs):
+
+ 1066 kmem:mm_page_alloc ( +- 26.148% )
+ 182 kmem:mm_page_free_direct ( +- 5.464% )
+ 890 kmem:mm_pagevec_free ( +- 30.079% )
+
+ 1.002251757 seconds time elapsed ( +- 0.005% )
+
+5. Higher-Level Analysis with Helper Scripts
+============================================
+
+When events are enabled the events that are triggering can be read from
+/sys/kernel/debug/tracing/trace_pipe in human-readable format although binary
+options exist as well. By post-processing the output, further information can
+be gathered on-line as appropriate. Examples of post-processing might include
+
+ o Reading information from /proc for the PID that triggered the event
+ o Deriving a higher-level event from a series of lower-level events.
+ o Calculate latencies between two events
+
+Documentation/trace/postprocess/trace-pagealloc-postprocess.pl is an example
+script that can read trace_pipe from STDIN or a copy of a trace. When used
+on-line, it can be interrupted once to generate a report without existing
+and twice to exit.
+
+Simplistically, the script just reads STDIN and counts up events but it
+also can do more such as
+
+ o Derive high-level events from many low-level events. If a number of pages
+ are freed to the main allocator from the per-CPU lists, it recognises
+ that as one per-CPU drain even though there is no specific tracepoint
+ for that event
+ o It can aggregate based on PID or individual process number
+ o In the event memory is getting externally fragmented, it reports
+ on whether the fragmentation event was severe or moderate.
+ o When receiving an event about a PID, it can record who the parent was so
+ that if large numbers of events are coming from very short-lived
+ processes, the parent process responsible for creating all the helpers
+ can be identified
+
+6. Lower-Level Analysis with PCL
+================================
+
+There may also be a requirement to identify what functions with a program
+were generating events within the kernel. To begin this sort of analysis, the
+data must be recorded. At the time of writing, this required root
+
+ $ perf record -c 1 \
+ -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
+ -e kmem:mm_pagevec_free \
+ ./hackbench 10
+ Time: 0.894
+ [ perf record: Captured and wrote 0.733 MB perf.data (~32010 samples) ]
+
+Note the use of '-c 1' to set the event period to sample. The default sample
+period is quite high to minimise overhead but the information collected can be
+very coarse as a result.
+
+This record outputted a file called perf.data which can be analysed using
+perf report.
+
+ $ perf report
+ # Samples: 30922
+ #
+ # Overhead Command Shared Object
+ # ........ ......... ................................
+ #
+ 87.27% hackbench [vdso]
+ 6.85% hackbench /lib/i686/cmov/libc-2.9.so
+ 2.62% hackbench /lib/ld-2.9.so
+ 1.52% perf [vdso]
+ 1.22% hackbench ./hackbench
+ 0.48% hackbench [kernel]
+ 0.02% perf /lib/i686/cmov/libc-2.9.so
+ 0.01% perf /usr/bin/perf
+ 0.01% perf /lib/ld-2.9.so
+ 0.00% hackbench /lib/i686/cmov/libpthread-2.9.so
+ #
+ # (For more details, try: perf report --sort comm,dso,symbol)
+ #
+
+According to this, the vast majority of events occured triggered on events
+within the VDSO. With simple binaries, this will often be the case so lets
+take a slightly different example. In the course of writing this, it was
+noticed that X was generating an insane amount of page allocations so lets look
+at it
+
+ $ perf record -c 1 -f \
+ -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
+ -e kmem:mm_pagevec_free \
+ -p `pidof X`
+
+This was interrupted after a few seconds and
+
+ $ perf report
+ # Samples: 27666
+ #
+ # Overhead Command Shared Object
+ # ........ ....... .......................................
+ #
+ 51.95% Xorg [vdso]
+ 47.95% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1
+ 0.09% Xorg /lib/i686/cmov/libc-2.9.so
+ 0.01% Xorg [kernel]
+ #
+ # (For more details, try: perf report --sort comm,dso,symbol)
+ #
+
+So, almost half of the events are occuring in a library. To get an idea which
+symbol.
+
+ $ perf report --sort comm,dso,symbol
+ # Samples: 27666
+ #
+ # Overhead Command Shared Object Symbol
+ # ........ ....... ....................................... ......
+ #
+ 51.95% Xorg [vdso] [.] 0x000000ffffe424
+ 47.93% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixmanFillsse2
+ 0.09% Xorg /lib/i686/cmov/libc-2.9.so [.] _int_malloc
+ 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixman_region32_copy_f
+ 0.01% Xorg [kernel] [k] read_hpet
+ 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] get_fast_path
+ 0.00% Xorg [kernel] [k] ftrace_trace_userstack
+
+To see where within the function pixmanFillsse2 things are going wrong
+
+ $ perf annotate pixmanFillsse2
+ [ ... ]
+ 0.00 : 34eeb: 0f 18 08 prefetcht0 (%eax)
+ : }
+ :
+ : extern __inline void __attribute__((__gnu_inline__, __always_inline__, _
+ : _mm_store_si128 (__m128i *__P, __m128i __B) : {
+ : *__P = __B;
+ 12.40 : 34eee: 66 0f 7f 80 40 ff ff movdqa %xmm0,-0xc0(%eax)
+ 0.00 : 34ef5: ff
+ 12.40 : 34ef6: 66 0f 7f 80 50 ff ff movdqa %xmm0,-0xb0(%eax)
+ 0.00 : 34efd: ff
+ 12.39 : 34efe: 66 0f 7f 80 60 ff ff movdqa %xmm0,-0xa0(%eax)
+ 0.00 : 34f05: ff
+ 12.67 : 34f06: 66 0f 7f 80 70 ff ff movdqa %xmm0,-0x90(%eax)
+ 0.00 : 34f0d: ff
+ 12.58 : 34f0e: 66 0f 7f 40 80 movdqa %xmm0,-0x80(%eax)
+ 12.31 : 34f13: 66 0f 7f 40 90 movdqa %xmm0,-0x70(%eax)
+ 12.40 : 34f18: 66 0f 7f 40 a0 movdqa %xmm0,-0x60(%eax)
+ 12.31 : 34f1d: 66 0f 7f 40 b0 movdqa %xmm0,-0x50(%eax)
+
+At a glance, it looks like the time is being spent copying pixmaps to
+the card. Further investigation would be needed to determine why pixmaps
+are being copied around so much but a starting point would be to take an
+ancient build of libpixmap out of the library path where it was totally
+forgotten about from months ago!
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 2f77ced..e57d6a9 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -6,6 +6,8 @@ balance
- various information on memory balancing.
hugetlbpage.txt
- a brief summary of hugetlbpage support in the Linux kernel.
+ksm.txt
+ - how to use the Kernel Samepage Merging feature.
locking
- info on how locking and synchronization is done in the Linux vm code.
numa
@@ -20,3 +22,5 @@ slabinfo.c
- source code for a tool to get reports about slabs.
slub.txt
- a short users guide for SLUB.
+map_hugetlb.c
+ - an example program that uses the MAP_HUGETLB mmap flag.
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index ea8714f..82a7bd1 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -18,13 +18,13 @@ First the Linux kernel needs to be built with the CONFIG_HUGETLBFS
automatically when CONFIG_HUGETLBFS is selected) configuration
options.
-The kernel built with hugepage support should show the number of configured
-hugepages in the system by running the "cat /proc/meminfo" command.
+The kernel built with huge page support should show the number of configured
+huge pages in the system by running the "cat /proc/meminfo" command.
/proc/meminfo also provides information about the total number of hugetlb
pages configured in the kernel. It also displays information about the
number of free hugetlb pages at any time. It also displays information about
-the configured hugepage size - this is needed for generating the proper
+the configured huge page size - this is needed for generating the proper
alignment and size of the arguments to the above system calls.
The output of "cat /proc/meminfo" will have lines like:
@@ -37,25 +37,27 @@ HugePages_Surp: yyy
Hugepagesize: zzz kB
where:
-HugePages_Total is the size of the pool of hugepages.
-HugePages_Free is the number of hugepages in the pool that are not yet
-allocated.
-HugePages_Rsvd is short for "reserved," and is the number of hugepages
-for which a commitment to allocate from the pool has been made, but no
-allocation has yet been made. It's vaguely analogous to overcommit.
-HugePages_Surp is short for "surplus," and is the number of hugepages in
-the pool above the value in /proc/sys/vm/nr_hugepages. The maximum
-number of surplus hugepages is controlled by
-/proc/sys/vm/nr_overcommit_hugepages.
+HugePages_Total is the size of the pool of huge pages.
+HugePages_Free is the number of huge pages in the pool that are not yet
+ allocated.
+HugePages_Rsvd is short for "reserved," and is the number of huge pages for
+ which a commitment to allocate from the pool has been made,
+ but no allocation has yet been made. Reserved huge pages
+ guarantee that an application will be able to allocate a
+ huge page from the pool of huge pages at fault time.
+HugePages_Surp is short for "surplus," and is the number of huge pages in
+ the pool above the value in /proc/sys/vm/nr_hugepages. The
+ maximum number of surplus huge pages is controlled by
+ /proc/sys/vm/nr_overcommit_hugepages.
/proc/filesystems should also show a filesystem of type "hugetlbfs" configured
in the kernel.
/proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb
pages in the kernel. Super user can dynamically request more (or free some
-pre-configured) hugepages.
+pre-configured) huge pages.
The allocation (or deallocation) of hugetlb pages is possible only if there are
-enough physically contiguous free pages in system (freeing of hugepages is
+enough physically contiguous free pages in system (freeing of huge pages is
possible only if there are enough hugetlb pages free that can be transferred
back to regular memory pool).
@@ -67,43 +69,82 @@ use either the mmap system call or shared memory system calls to start using
the huge pages. It is required that the system administrator preallocate
enough memory for huge page purposes.
-Use the following command to dynamically allocate/deallocate hugepages:
+The administrator can preallocate huge pages on the kernel boot command line by
+specifying the "hugepages=N" parameter, where 'N' = the number of huge pages
+requested. This is the most reliable method for preallocating huge pages as
+memory has not yet become fragmented.
+
+Some platforms support multiple huge page sizes. To preallocate huge pages
+of a specific size, one must preceed the huge pages boot command parameters
+with a huge page size selection parameter "hugepagesz=<size>". <size> must
+be specified in bytes with optional scale suffix [kKmMgG]. The default huge
+page size may be selected with the "default_hugepagesz=<size>" boot parameter.
+
+/proc/sys/vm/nr_hugepages indicates the current number of configured [default
+size] hugetlb pages in the kernel. Super user can dynamically request more
+(or free some pre-configured) huge pages.
+
+Use the following command to dynamically allocate/deallocate default sized
+huge pages:
echo 20 > /proc/sys/vm/nr_hugepages
-This command will try to configure 20 hugepages in the system. The success
-or failure of allocation depends on the amount of physically contiguous
-memory that is preset in system at this time. System administrators may want
-to put this command in one of the local rc init files. This will enable the
-kernel to request huge pages early in the boot process (when the possibility
-of getting physical contiguous pages is still very high). In either
-case, administrators will want to verify the number of hugepages actually
-allocated by checking the sysctl or meminfo.
-
-/proc/sys/vm/nr_overcommit_hugepages indicates how large the pool of
-hugepages can grow, if more hugepages than /proc/sys/vm/nr_hugepages are
-requested by applications. echo'ing any non-zero value into this file
-indicates that the hugetlb subsystem is allowed to try to obtain
-hugepages from the buddy allocator, if the normal pool is exhausted. As
-these surplus hugepages go out of use, they are freed back to the buddy
+This command will try to configure 20 default sized huge pages in the system.
+On a NUMA platform, the kernel will attempt to distribute the huge page pool
+over the all on-line nodes. These huge pages, allocated when nr_hugepages
+is increased, are called "persistent huge pages".
+
+The success or failure of huge page allocation depends on the amount of
+physically contiguous memory that is preset in system at the time of the
+allocation attempt. If the kernel is unable to allocate huge pages from
+some nodes in a NUMA system, it will attempt to make up the difference by
+allocating extra pages on other nodes with sufficient available contiguous
+memory, if any.
+
+System administrators may want to put this command in one of the local rc init
+files. This will enable the kernel to request huge pages early in the boot
+process when the possibility of getting physical contiguous pages is still
+very high. Administrators can verify the number of huge pages actually
+allocated by checking the sysctl or meminfo. To check the per node
+distribution of huge pages in a NUMA system, use:
+
+ cat /sys/devices/system/node/node*/meminfo | fgrep Huge
+
+/proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of
+huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are
+requested by applications. Writing any non-zero value into this file
+indicates that the hugetlb subsystem is allowed to try to obtain "surplus"
+huge pages from the buddy allocator, when the normal pool is exhausted. As
+these surplus huge pages go out of use, they are freed back to the buddy
allocator.
+When increasing the huge page pool size via nr_hugepages, any surplus
+pages will first be promoted to persistent huge pages. Then, additional
+huge pages will be allocated, if necessary and if possible, to fulfill
+the new huge page pool size.
+
+The administrator may shrink the pool of preallocated huge pages for
+the default huge page size by setting the nr_hugepages sysctl to a
+smaller value. The kernel will attempt to balance the freeing of huge pages
+across all on-line nodes. Any free huge pages on the selected nodes will
+be freed back to the buddy allocator.
+
Caveat: Shrinking the pool via nr_hugepages such that it becomes less
-than the number of hugepages in use will convert the balance to surplus
+than the number of huge pages in use will convert the balance to surplus
huge pages even if it would exceed the overcommit value. As long as
this condition holds, however, no more surplus huge pages will be
allowed on the system until one of the two sysctls are increased
sufficiently, or the surplus huge pages go out of use and are freed.
-With support for multiple hugepage pools at run-time available, much of
-the hugepage userspace interface has been duplicated in sysfs. The above
-information applies to the default hugepage size (which will be
-controlled by the proc interfaces for backwards compatibility). The root
-hugepage control directory is
+With support for multiple huge page pools at run-time available, much of
+the huge page userspace interface has been duplicated in sysfs. The above
+information applies to the default huge page size which will be
+controlled by the /proc interfaces for backwards compatibility. The root
+huge page control directory in sysfs is:
/sys/kernel/mm/hugepages
-For each hugepage size supported by the running kernel, a subdirectory
+For each huge page size supported by the running kernel, a subdirectory
will exist, of the form
hugepages-${size}kB
@@ -116,9 +157,9 @@ Inside each of these directories, the same set of files will exist:
resv_hugepages
surplus_hugepages
-which function as described above for the default hugepage-sized case.
+which function as described above for the default huge page-sized case.
-If the user applications are going to request hugepages using mmap system
+If the user applications are going to request huge pages using mmap system
call, then it is required that system administrator mount a file system of
type hugetlbfs:
@@ -127,7 +168,7 @@ type hugetlbfs:
none /mnt/huge
This command mounts a (pseudo) filesystem of type hugetlbfs on the directory
-/mnt/huge. Any files created on /mnt/huge uses hugepages. The uid and gid
+/mnt/huge. Any files created on /mnt/huge uses huge pages. The uid and gid
options sets the owner and group of the root of the file system. By default
the uid and gid of the current process are taken. The mode option sets the
mode of root of file system to value & 0777. This value is given in octal.
@@ -146,24 +187,26 @@ Regular chown, chgrp, and chmod commands (with right permissions) could be
used to change the file attributes on hugetlbfs.
Also, it is important to note that no such mount command is required if the
-applications are going to use only shmat/shmget system calls. Users who
-wish to use hugetlb page via shared memory segment should be a member of
-a supplementary group and system admin needs to configure that gid into
-/proc/sys/vm/hugetlb_shm_group. It is possible for same or different
-applications to use any combination of mmaps and shm* calls, though the
-mount of filesystem will be required for using mmap calls.
+applications are going to use only shmat/shmget system calls or mmap with
+MAP_HUGETLB. Users who wish to use hugetlb page via shared memory segment
+should be a member of a supplementary group and system admin needs to
+configure that gid into /proc/sys/vm/hugetlb_shm_group. It is possible for
+same or different applications to use any combination of mmaps and shm*
+calls, though the mount of filesystem will be required for using mmap calls
+without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see
+map_hugetlb.c.
*******************************************************************
/*
- * Example of using hugepage memory in a user application using Sys V shared
+ * Example of using huge page memory in a user application using Sys V shared
* memory system calls. In this example the app is requesting 256MB of
* memory that is backed by huge pages. The application uses the flag
* SHM_HUGETLB in the shmget system call to inform the kernel that it is
- * requesting hugepages.
+ * requesting huge pages.
*
* For the ia64 architecture, the Linux kernel reserves Region number 4 for
- * hugepages. That means the addresses starting with 0x800000... will need
+ * huge pages. That means the addresses starting with 0x800000... will need
* to be specified. Specifying a fixed address is not required on ppc64,
* i386 or x86_64.
*
@@ -252,14 +295,14 @@ int main(void)
*******************************************************************
/*
- * Example of using hugepage memory in a user application using the mmap
+ * Example of using huge page memory in a user application using the mmap
* system call. Before running this application, make sure that the
* administrator has mounted the hugetlbfs filesystem (on some directory
* like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
* example, the app is requesting memory of size 256MB that is backed by
* huge pages.
*
- * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * For ia64 architecture, Linux kernel reserves Region number 4 for huge pages.
* That means the addresses starting with 0x800000... will need to be
* specified. Specifying a fixed address is not required on ppc64, i386
* or x86_64.
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
new file mode 100644
index 0000000..72a22f6
--- /dev/null
+++ b/Documentation/vm/ksm.txt
@@ -0,0 +1,89 @@
+How to use the Kernel Samepage Merging feature
+----------------------------------------------
+
+KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
+added to the Linux kernel in 2.6.32. See mm/ksm.c for its implementation,
+and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
+
+The KSM daemon ksmd periodically scans those areas of user memory which
+have been registered with it, looking for pages of identical content which
+can be replaced by a single write-protected page (which is automatically
+copied if a process later wants to update its content).
+
+KSM was originally developed for use with KVM (where it was known as
+Kernel Shared Memory), to fit more virtual machines into physical memory,
+by sharing the data common between them. But it can be useful to any
+application which generates many instances of the same data.
+
+KSM only merges anonymous (private) pages, never pagecache (file) pages.
+KSM's merged pages are at present locked into kernel memory for as long
+as they are shared: so cannot be swapped out like the user pages they
+replace (but swapping KSM pages should follow soon in a later release).
+
+KSM only operates on those areas of address space which an application
+has advised to be likely candidates for merging, by using the madvise(2)
+system call: int madvise(addr, length, MADV_MERGEABLE).
+
+The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel
+that advice and restore unshared pages: whereupon KSM unmerges whatever
+it merged in that range. Note: this unmerging call may suddenly require
+more memory than is available - possibly failing with EAGAIN, but more
+probably arousing the Out-Of-Memory killer.
+
+If KSM is not configured into the running kernel, madvise MADV_MERGEABLE
+and MADV_UNMERGEABLE simply fail with EINVAL. If the running kernel was
+built with CONFIG_KSM=y, those calls will normally succeed: even if the
+the KSM daemon is not currently running, MADV_MERGEABLE still registers
+the range for whenever the KSM daemon is started; even if the range
+cannot contain any pages which KSM could actually merge; even if
+MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
+
+Like other madvise calls, they are intended for use on mapped areas of
+the user address space: they will report ENOMEM if the specified range
+includes unmapped gaps (though working on the intervening mapped areas),
+and might fail with EAGAIN if not enough memory for internal structures.
+
+Applications should be considerate in their use of MADV_MERGEABLE,
+restricting its use to areas likely to benefit. KSM's scans may use
+a lot of processing power, and its kernel-resident pages are a limited
+resource. Some installations will disable KSM for these reasons.
+
+The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/,
+readable by all but writable only by root:
+
+max_kernel_pages - set to maximum number of kernel pages that KSM may use
+ e.g. "echo 2000 > /sys/kernel/mm/ksm/max_kernel_pages"
+ Value 0 imposes no limit on the kernel pages KSM may use;
+ but note that any process using MADV_MERGEABLE can cause
+ KSM to allocate these pages, unswappable until it exits.
+ Default: 2000 (chosen for demonstration purposes)
+
+pages_to_scan - how many present pages to scan before ksmd goes to sleep
+ e.g. "echo 200 > /sys/kernel/mm/ksm/pages_to_scan"
+ Default: 200 (chosen for demonstration purposes)
+
+sleep_millisecs - how many milliseconds ksmd should sleep before next scan
+ e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
+ Default: 20 (chosen for demonstration purposes)
+
+run - set 0 to stop ksmd from running but keep merged pages,
+ set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run",
+ set 2 to stop ksmd and unmerge all pages currently merged,
+ but leave mergeable areas registered for next run
+ Default: 1 (for immediate use by apps which register)
+
+The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/:
+
+pages_shared - how many shared unswappable kernel pages KSM is using
+pages_sharing - how many more sites are sharing them i.e. how much saved
+pages_unshared - how many pages unique but repeatedly checked for merging
+pages_volatile - how many pages changing too fast to be placed in a tree
+full_scans - how many times all mergeable areas have been scanned
+
+A high ratio of pages_sharing to pages_shared indicates good sharing, but
+a high ratio of pages_unshared to pages_sharing indicates wasted effort.
+pages_volatile embraces several different kinds of activity, but a high
+proportion there would also indicate poor use of madvise MADV_MERGEABLE.
+
+Izik Eidus,
+Hugh Dickins, 30 July 2009
diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c
new file mode 100644
index 0000000..e2bdae3
--- /dev/null
+++ b/Documentation/vm/map_hugetlb.c
@@ -0,0 +1,77 @@
+/*
+ * Example of using hugepage memory in a user application using the mmap
+ * system call with MAP_HUGETLB flag. Before running this program make
+ * sure the administrator has allocated enough default sized huge pages
+ * to cover the 256 MB allocation.
+ *
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified. Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40
+#endif
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+void check_bytes(char *addr)
+{
+ printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+void write_bytes(char *addr)
+{
+ unsigned long i;
+
+ for (i = 0; i < LENGTH; i++)
+ *(addr + i) = (char)i;
+}
+
+void read_bytes(char *addr)
+{
+ unsigned long i;
+
+ check_bytes(addr);
+ for (i = 0; i < LENGTH; i++)
+ if (*(addr + i) != (char)i) {
+ printf("Mismatch at %lu\n", i);
+ break;
+ }
+}
+
+int main(void)
+{
+ void *addr;
+
+ addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ printf("Returned address is %p\n", addr);
+ check_bytes(addr);
+ write_bytes(addr);
+ read_bytes(addr);
+
+ munmap(addr, LENGTH);
+
+ return 0;
+}
OpenPOWER on IntegriCloud