author     Paul Mundt <lethal@linux-sh.org>    2011-01-14 16:06:31 +0900
committer  Paul Mundt <lethal@linux-sh.org>    2011-01-14 16:06:31 +0900
commit     c488a4731abb53aa1bab9fccd8a7472083159bfd (patch)
tree       db6d4a664a1e4b7685c1d2d79da63263f40adf7b
parent     6d2ae89c36e2adab5cfa69fecb11290082817ac6 (diff)
parent     bba958783b1b4cb0a9420f4e11082467132a334c (diff)
Merge branch 'common/mmcif' into rmobile-latest
181 files changed, 9940 insertions, 2132 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 7781857..bac328c 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -385,6 +385,10 @@ mapped_file - # of bytes of mapped file (includes tmpfs/shmem) pgpgin - # of pages paged in (equivalent to # of charging events). pgpgout - # of pages paged out (equivalent to # of uncharging events). swap - # of bytes of swap usage +dirty - # of bytes that are waiting to get written back to the disk. +writeback - # of bytes that are actively being written back to the disk. +nfs_unstable - # of bytes sent to the NFS server, but not yet committed to + the actual storage. inactive_anon - # of bytes of anonymous memory and swap cache memory on LRU list. active_anon - # of bytes of anonymous and swap cache memory on active @@ -406,6 +410,9 @@ total_mapped_file - sum of all children's "cache" total_pgpgin - sum of all children's "pgpgin" total_pgpgout - sum of all children's "pgpgout" total_swap - sum of all children's "swap" +total_dirty - sum of all children's "dirty" +total_writeback - sum of all children's "writeback" +total_nfs_unstable - sum of all children's "nfs_unstable" total_inactive_anon - sum of all children's "inactive_anon" total_active_anon - sum of all children's "active_anon" total_inactive_file - sum of all children's "inactive_file" @@ -453,6 +460,73 @@ memory under it will be reclaimed. You can reset failcnt by writing 0 to failcnt file. # echo 0 > .../memory.failcnt +5.5 dirty memory + +Control the maximum amount of dirty pages a cgroup can have at any given time. + +Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim) +page cache used by a cgroup. So, in case of multiple cgroup writers, they will +not be able to consume more than their designated share of dirty pages and will +be forced to perform write-out if they cross that limit. + +The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*. It +is possible to configure a limit to trigger both a direct writeback or a +background writeback performed by per-bdi flusher threads. The root cgroup +memory.dirty_* control files are read-only and match the contents of +the /proc/sys/vm/dirty_* files. + +Per-cgroup dirty limits can be set using the following files in the cgroupfs: + +- memory.dirty_ratio: the amount of dirty memory (expressed as a percentage of + cgroup memory) at which a process generating dirty pages will itself start + writing out dirty data. + +- memory.dirty_limit_in_bytes: the amount of dirty memory (expressed in bytes) + in the cgroup at which a process generating dirty pages will start itself + writing out dirty data. Suffix (k, K, m, M, g, or G) can be used to indicate + that value is kilo, mega or gigabytes. + + Note: memory.dirty_limit_in_bytes is the counterpart of memory.dirty_ratio. + Only one of them may be specified at a time. When one is written it is + immediately taken into account to evaluate the dirty memory limits and the + other appears as 0 when read. + +- memory.dirty_background_ratio: the amount of dirty memory of the cgroup + (expressed as a percentage of cgroup memory) at which background writeback + kernel threads will start writing out dirty data. + +- memory.dirty_background_limit_in_bytes: the amount of dirty memory (expressed + in bytes) in the cgroup at which background writeback kernel threads will + start writing out dirty data. Suffix (k, K, m, M, g, or G) can be used to + indicate that value is kilo, mega or gigabytes. 
+ + Note: memory.dirty_background_limit_in_bytes is the counterpart of + memory.dirty_background_ratio. Only one of them may be specified at a time. + When one is written it is immediately taken into account to evaluate the dirty + memory limits and the other appears as 0 when read. + +A cgroup may contain more dirty memory than its dirty limit. This is possible +because of the principle that the first cgroup to touch a page is charged for +it. Subsequent page counting events (dirty, writeback, nfs_unstable) are also +counted to the originally charged cgroup. + +Example: If page is allocated by a cgroup A task, then the page is charged to +cgroup A. If the page is later dirtied by a task in cgroup B, then the cgroup A +dirty count will be incremented. If cgroup A is over its dirty limit but cgroup +B is not, then dirtying a cgroup A page from a cgroup B task may push cgroup A +over its dirty limit without throttling the dirtying cgroup B task. + +When use_hierarchy=0, each cgroup has dirty memory usage and limits. +System-wide dirty limits are also consulted. Dirty memory consumption is +checked against both system-wide and per-cgroup dirty limits. + +The current implementation does not enforce per-cgroup dirty limits when +use_hierarchy=1. System-wide dirty limits are used for processes in such +cgroups. Attempts to read memory.dirty_* files return the system-wide +values. Writes to the memory.dirty_* files return error. An enhanced +implementation is needed to check the chain of parents to ensure that no +dirty limit is exceeded. + 6. Hierarchy support The memory controller supports a deep hierarchy and hierarchical accounting. diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index 524de92..59293ac 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -8,7 +8,7 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset> <cipher> Encryption cipher and an optional IV generation mode. - (In format cipher-chainmode-ivopts:ivmode). + (In format cipher[:keycount]-chainmode-ivopts:ivmode). Examples: des aes-cbc-essiv:sha256 @@ -20,6 +20,11 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset> Key used for encryption. It is encoded as a hexadecimal number. You can only use key sizes that are valid for the selected cipher. +<keycount> + Multi-key compatibility mode. You can define <keycount> keys and + then sectors are encrypted according to their offsets (sector 0 uses key0; + sector 1 uses key1 etc.). <keycount> must be a power of two. + <iv_offset> The IV offset is a sector count that is added to the sector number before creating the IV. diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt new file mode 100644 index 0000000..33b6b70 --- /dev/null +++ b/Documentation/device-mapper/dm-raid.txt @@ -0,0 +1,70 @@ +Device-mapper RAID (dm-raid) is a bridge from DM to MD. It +provides a way to use device-mapper interfaces to access the MD RAID +drivers. + +As with all device-mapper targets, the nominal public interfaces are the +constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO +and STATUSTYPE_TABLE). The CTR table looks like the following: + +1: <s> <l> raid \ +2: <raid_type> <#raid_params> <raid_params> \ +3: <#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN> + +Line 1 contains the standard first three arguments to any device-mapper +target - the start, length, and target type fields. 
The target type in +this case is "raid". + +Line 2 contains the arguments that define the particular raid +type/personality/level, the required arguments for that raid type, and +any optional arguments. Possible raid types include: raid4, raid5_la, +raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc. (raid1 is +planned for the future.) The list of required and optional parameters +is the same for all the current raid types. The required parameters are +positional, while the optional parameters are given as key/value pairs. +The possible parameters are as follows: + <chunk_size> Chunk size in sectors. + [[no]sync] Force/Prevent RAID initialization + [rebuild <idx>] Rebuild the drive indicated by the index + [daemon_sleep <ms>] Time between bitmap daemon work to clear bits + [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization + [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization + [max_write_behind <sectors>] See '-write-behind=' (man mdadm) + [stripe_cache <sectors>] Stripe cache size for higher RAIDs + +Line 3 contains the list of devices that compose the array in +metadata/data device pairs. If the metadata is stored separately, a '-' +is given for the metadata device position. If a drive has failed or is +missing at creation time, a '-' can be given for both the metadata and +data drives for a given position. + +NB. Currently all metadata devices must be specified as '-'. + +Examples: +# RAID4 - 4 data drives, 1 parity +# No metadata devices specified to hold superblock/bitmap info +# Chunk size of 1MiB +# (Lines separated for easy reading) +0 1960893648 raid \ + raid4 1 2048 \ + 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + +# RAID4 - 4 data drives, 1 parity (no metadata devices) +# Chunk size of 1MiB, force RAID initialization, +# min recovery rate at 20 kiB/sec/disk +0 1960893648 raid \ + raid4 4 2048 min_recovery_rate 20 sync\ + 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + +Performing a 'dmsetup table' should display the CTR table used to +construct the mapping (with possible reordering of optional +parameters). + +Performing a 'dmsetup status' will yield information on the state and +health of the array. The output is as follows: +1: <s> <l> raid \ +2: <raid_type> <#devices> <1 health char for each dev> <resync_ratio> + +Line 1 is standard DM output. Line 2 is best shown by example: + 0 1960893648 raid raid4 5 AAAAA 2/490221568 +Here we can see the RAID type is raid4, there are 5 devices - all of +which are 'A'live, and the array is 2/490221568 complete with recovery. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 9471225..23cae65 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -375,6 +375,7 @@ Anonymous: 0 kB Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB +Locked: 374 kB The first of these lines shows the same information as is displayed for the mapping in /proc/PID/maps. The remaining lines show the size of the mapping @@ -670,6 +671,8 @@ varies by architecture and compile options. The following is from a > cat /proc/meminfo +The "Locked" indicates whether the mapping is locked in memory or not. + MemTotal: 16344972 kB MemFree: 13634064 kB @@ -1320,6 +1323,10 @@ scaled linearly with /proc/<pid>/oom_score_adj. Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the other with its scaled value. +The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last +value set by a CAP_SYS_RESOURCE process. 
To reduce the value any lower +requires CAP_SYS_RESOURCE. + NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see Documentation/feature-removal-schedule.txt. diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt index a492d92..792faa3 100644 --- a/Documentation/gpio.txt +++ b/Documentation/gpio.txt @@ -135,7 +135,7 @@ setting up a platform_device using the GPIO, is mark its direction: int gpio_direction_input(unsigned gpio); int gpio_direction_output(unsigned gpio, int value); -The return value is zero for success, else a negative errno. It must +The return value is zero for success, else a negative errno. It should be checked, since the get/set calls don't have error returns and since misconfiguration is possible. You should normally issue these calls from a task context. However, for spinlock-safe GPIOs it's OK to use them diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt new file mode 100644 index 0000000..0924aac --- /dev/null +++ b/Documentation/vm/transhuge.txt @@ -0,0 +1,298 @@ += Transparent Hugepage Support = + +== Objective == + +Performance critical computing applications dealing with large memory +working sets are already running on top of libhugetlbfs and in turn +hugetlbfs. Transparent Hugepage Support is an alternative means of +using huge pages for the backing of virtual memory with huge pages +that supports the automatic promotion and demotion of page sizes and +without the shortcomings of hugetlbfs. + +Currently it only works for anonymous memory mappings but in the +future it can expand over the pagecache layer starting with tmpfs. + +The reason applications are running faster is because of two +factors. The first factor is almost completely irrelevant and it's not +of significant interest because it'll also have the downside of +requiring larger clear-page copy-page in page faults which is a +potentially negative effect. The first factor consists in taking a +single page fault for each 2M virtual region touched by userland (so +reducing the enter/exit kernel frequency by a 512 times factor). This +only matters the first time the memory is accessed for the lifetime of +a memory mapping. The second long lasting and much more important +factor will affect all subsequent accesses to the memory for the whole +runtime of the application. The second factor consist of two +components: 1) the TLB miss will run faster (especially with +virtualization using nested pagetables but almost always also on bare +metal without virtualization) and 2) a single TLB entry will be +mapping a much larger amount of virtual memory in turn reducing the +number of TLB misses. With virtualization and nested pagetables the +TLB can be mapped of larger size only if both KVM and the Linux guest +are using hugepages but a significant speedup already happens if only +one of the two is using hugepages just because of the fact the TLB +miss is going to run faster. 
+ +== Design == + +- "graceful fallback": mm components which don't have transparent + hugepage knowledge fall back to breaking a transparent hugepage and + working on the regular pages and their respective regular pmd/pte + mappings + +- if a hugepage allocation fails because of memory fragmentation, + regular pages should be gracefully allocated instead and mixed in + the same vma without any failure or significant delay and without + userland noticing + +- if some task quits and more hugepages become available (either + immediately in the buddy or through the VM), guest physical memory + backed by regular pages should be relocated on hugepages + automatically (with khugepaged) + +- it doesn't require memory reservation and in turn it uses hugepages + whenever possible (the only possible reservation here is kernelcore= + to avoid unmovable pages to fragment all the memory but such a tweak + is not specific to transparent hugepage support and it's a generic + feature that applies to all dynamic high order allocations in the + kernel) + +- this initial support only offers the feature in the anonymous memory + regions but it'd be ideal to move it to tmpfs and the pagecache + later + +Transparent Hugepage Support maximizes the usefulness of free memory +if compared to the reservation approach of hugetlbfs by allowing all +unused memory to be used as cache or other movable (or even unmovable +entities). It doesn't require reservation to prevent hugepage +allocation failures to be noticeable from userland. It allows paging +and all other advanced VM features to be available on the +hugepages. It requires no modifications for applications to take +advantage of it. + +Applications however can be further optimized to take advantage of +this feature, like for example they've been optimized before to avoid +a flood of mmap system calls for every malloc(4k). Optimizing userland +is by far not mandatory and khugepaged already can take care of long +lived page allocations even for hugepage unaware applications that +deals with large amounts of memory. + +In certain cases when hugepages are enabled system wide, application +may end up allocating more memory resources. An application may mmap a +large region but only touch 1 byte of it, in that case a 2M page might +be allocated instead of a 4k page for no good. This is why it's +possible to disable hugepages system-wide and to only have them inside +MADV_HUGEPAGE madvise regions. + +Embedded systems should enable hugepages only inside madvise regions +to eliminate any risk of wasting any precious byte of memory and to +only run faster. + +Applications that gets a lot of benefit from hugepages and that don't +risk to lose memory by using hugepages, should use +madvise(MADV_HUGEPAGE) on their critical mmapped regions. + +== sysfs == + +Transparent Hugepage Support can be entirely disabled (mostly for +debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to +avoid the risk of consuming more memory resources) or enabled system +wide. This can be achieved with one of: + +echo always >/sys/kernel/mm/transparent_hugepage/enabled +echo madvise >/sys/kernel/mm/transparent_hugepage/enabled +echo never >/sys/kernel/mm/transparent_hugepage/enabled + +It's also possible to limit defrag efforts in the VM to generate +hugepages in case they're not immediately free to madvise regions or +to never try to defrag memory and simply fallback to regular pages +unless hugepages are immediately available. 
Clearly if we spend CPU +time to defrag memory, we would expect to gain even more by the fact +we use hugepages later instead of regular pages. This isn't always +guaranteed, but it may be more likely in case the allocation is for a +MADV_HUGEPAGE region. + +echo always >/sys/kernel/mm/transparent_hugepage/defrag +echo madvise >/sys/kernel/mm/transparent_hugepage/defrag +echo never >/sys/kernel/mm/transparent_hugepage/defrag + +khugepaged will be automatically started when +transparent_hugepage/enabled is set to "always" or "madvise, and it'll +be automatically shutdown if it's set to "never". + +khugepaged runs usually at low frequency so while one may not want to +invoke defrag algorithms synchronously during the page faults, it +should be worth invoking defrag at least in khugepaged. However it's +also possible to disable defrag in khugepaged: + +echo yes >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag +echo no >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag + +You can also control how many pages khugepaged should scan at each +pass: + +/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan + +and how many milliseconds to wait in khugepaged between each pass (you +can set this to 0 to run khugepaged at 100% utilization of one core): + +/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs + +and how many milliseconds to wait in khugepaged if there's an hugepage +allocation failure to throttle the next allocation attempt. + +/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs + +The khugepaged progress can be seen in the number of pages collapsed: + +/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed + +for each pass: + +/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans + +== Boot parameter == + +You can change the sysfs boot time defaults of Transparent Hugepage +Support by passing the parameter "transparent_hugepage=always" or +"transparent_hugepage=madvise" or "transparent_hugepage=never" +(without "") to the kernel command line. + +== Need of application restart == + +The transparent_hugepage/enabled values only affect future +behavior. So to make them effective you need to restart any +application that could have been using hugepages. This also applies to +the regions registered in khugepaged. + +== get_user_pages and follow_page == + +get_user_pages and follow_page if run on a hugepage, will return the +head or tail pages as usual (exactly as they would do on +hugetlbfs). Most gup users will only care about the actual physical +address of the page and its temporary pinning to release after the I/O +is complete, so they won't ever notice the fact the page is huge. But +if any driver is going to mangle over the page structure of the tail +page (like for checking page->mapping or other bits that are relevant +for the head page and not the tail page), it should be updated to jump +to check head page instead (while serializing properly against +split_huge_page() to avoid the head and tail pages to disappear from +under it, see the futex code to see an example of that, hugetlbfs also +needed special handling in futex code for similar reasons). + +NOTE: these aren't new constraints to the GUP API, and they match the +same constrains that applies to hugetlbfs too, so any driver capable +of handling GUP on hugetlbfs will also work fine on transparent +hugepage backed mappings. 
+ +In case you can't handle compound pages if they're returned by +follow_page, the FOLL_SPLIT bit can be specified as parameter to +follow_page, so that it will split the hugepages before returning +them. Migration for example passes FOLL_SPLIT as parameter to +follow_page because it's not hugepage aware and in fact it can't work +at all on hugetlbfs (but it instead works fine on transparent +hugepages thanks to FOLL_SPLIT). migration simply can't deal with +hugepages being returned (as it's not only checking the pfn of the +page and pinning it during the copy but it pretends to migrate the +memory in regular page sizes and with regular pte/pmd mappings). + +== Optimizing the applications == + +To be guaranteed that the kernel will map a 2M page immediately in any +memory region, the mmap region has to be hugepage naturally +aligned. posix_memalign() can provide that guarantee. + +== Hugetlbfs == + +You can use hugetlbfs on a kernel that has transparent hugepage +support enabled just fine as always. No difference can be noted in +hugetlbfs other than there will be less overall fragmentation. All +usual features belonging to hugetlbfs are preserved and +unaffected. libhugetlbfs will also work fine as usual. + +== Graceful fallback == + +Code walking pagetables but unware about huge pmds can simply call +split_huge_page_pmd(mm, pmd) where the pmd is the one returned by +pmd_offset. It's trivial to make the code transparent hugepage aware +by just grepping for "pmd_offset" and adding split_huge_page_pmd where +missing after pmd_offset returns the pmd. Thanks to the graceful +fallback design, with a one liner change, you can avoid to write +hundred if not thousand of lines of complex code to make your code +hugepage aware. + +If you're not walking pagetables but you run into a physical hugepage +but you can't handle it natively in your code, you can split it by +calling split_huge_page(page). This is what the Linux VM does before +it tries to swapout the hugepage for example. + +Example to make mremap.c transparent hugepage aware with a one liner +change: + +diff --git a/mm/mremap.c b/mm/mremap.c +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru + return NULL; + + pmd = pmd_offset(pud, addr); ++ split_huge_page_pmd(mm, pmd); + if (pmd_none_or_clear_bad(pmd)) + return NULL; + +== Locking in hugepage aware code == + +We want as much code as possible hugepage aware, as calling +split_huge_page() or split_huge_page_pmd() has a cost. + +To make pagetable walks huge pmd aware, all you need to do is to call +pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the +mmap_sem in read (or write) mode to be sure an huge pmd cannot be +created from under you by khugepaged (khugepaged collapse_huge_page +takes the mmap_sem in write mode in addition to the anon_vma lock). If +pmd_trans_huge returns false, you just fallback in the old code +paths. If instead pmd_trans_huge returns true, you have to take the +mm->page_table_lock and re-run pmd_trans_huge. Taking the +page_table_lock will prevent the huge pmd to be converted into a +regular pmd from under you (split_huge_page can run in parallel to the +pagetable walk). If the second pmd_trans_huge returns false, you +should just drop the page_table_lock and fallback to the old code as +before. Otherwise you should run pmd_trans_splitting on the pmd. In +case pmd_trans_splitting returns true, it means split_huge_page is +already in the middle of splitting the page. 
So if pmd_trans_splitting +returns true it's enough to drop the page_table_lock and call +wait_split_huge_page and then fallback the old code paths. You are +guaranteed by the time wait_split_huge_page returns, the pmd isn't +huge anymore. If pmd_trans_splitting returns false, you can proceed to +process the huge pmd and the hugepage natively. Once finished you can +drop the page_table_lock. + +== compound_lock, get_user_pages and put_page == + +split_huge_page internally has to distribute the refcounts in the head +page to the tail pages before clearing all PG_head/tail bits from the +page structures. It can do that easily for refcounts taken by huge pmd +mappings. But the GUI API as created by hugetlbfs (that returns head +and tail pages if running get_user_pages on an address backed by any +hugepage), requires the refcount to be accounted on the tail pages and +not only in the head pages, if we want to be able to run +split_huge_page while there are gup pins established on any tail +page. Failure to be able to run split_huge_page if there's any gup pin +on any tail page, would mean having to split all hugepages upfront in +get_user_pages which is unacceptable as too many gup users are +performance critical and they must work natively on hugepages like +they work natively on hugetlbfs already (hugetlbfs is simpler because +hugetlbfs pages cannot be splitted so there wouldn't be requirement of +accounting the pins on the tail pages for hugetlbfs). If we wouldn't +account the gup refcounts on the tail pages during gup, we won't know +anymore which tail page is pinned by gup and which is not while we run +split_huge_page. But we still have to add the gup pin to the head page +too, to know when we can free the compound page in case it's never +splitted during its lifetime. That requires changing not just +get_page, but put_page as well so that when put_page runs on a tail +page (and only on a tail page) it will find its respective head page, +and then it will decrease the head page refcount in addition to the +tail page refcount. To obtain a head page reliably and to decrease its +refcount without race conditions, put_page has to serialize against +__split_huge_page_refcount using a special per-page lock called +compound_lock. 
diff --git a/MAINTAINERS b/MAINTAINERS index 3dd5c6f..af656de 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6592,13 +6592,12 @@ F: Documentation/i2c/busses/i2c-viapro F: drivers/i2c/busses/i2c-viapro.c VIA SD/MMC CARD CONTROLLER DRIVER -M: Joseph Chan <JosephChan@via.com.tw> +M: Bruce Chang <brucechang@via.com.tw> M: Harald Welte <HaraldWelte@viatech.com> S: Maintained F: drivers/mmc/host/via-sdmmc.c VIA UNICHROME(PRO)/CHROME9 FRAMEBUFFER DRIVER -M: Joseph Chan <JosephChan@via.com.tw> M: Florian Tobias Schandinat <FlorianSchandinat@gmx.de> L: linux-fbdev@vger.kernel.org S: Maintained diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h index 99c56d4..72db984 100644 --- a/arch/alpha/include/asm/mman.h +++ b/arch/alpha/include/asm/mman.h @@ -53,6 +53,9 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 0c1bb68..2cfe816 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -38,17 +38,9 @@ #ifdef CONFIG_MMU void *module_alloc(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, -1, + __builtin_return_address(0)); } #else /* CONFIG_MMU */ void *module_alloc(unsigned long size) diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 93292a1..709244c 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -50,7 +50,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, new_pmd, 0); + new_pte = pte_alloc_map(mm, NULL, new_pmd, 0); if (!new_pte) goto no_pte; diff --git a/arch/avr32/boards/atngw100/setup.c b/arch/avr32/boards/atngw100/setup.c index 8c6a244..659d119 100644 --- a/arch/avr32/boards/atngw100/setup.c +++ b/arch/avr32/boards/atngw100/setup.c @@ -188,7 +188,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/atstk1000/atstk1002.c b/arch/avr32/boards/atstk1000/atstk1002.c index 2adc261..6ce30fb 100644 --- a/arch/avr32/boards/atstk1000/atstk1002.c +++ b/arch/avr32/boards/atstk1000/atstk1002.c @@ -203,7 +203,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/favr-32/setup.c b/arch/avr32/boards/favr-32/setup.c index 75f19f4..86fab77 100644 --- a/arch/avr32/boards/favr-32/setup.c +++ b/arch/avr32/boards/favr-32/setup.c @@ -206,7 +206,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/hammerhead/setup.c b/arch/avr32/boards/hammerhead/setup.c index dd009875..da14fbd 100644 --- a/arch/avr32/boards/hammerhead/setup.c +++ 
b/arch/avr32/boards/hammerhead/setup.c @@ -150,7 +150,7 @@ static void __init set_hw_addr(struct platform_device *pdev) regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/merisc/setup.c b/arch/avr32/boards/merisc/setup.c index 623b077..e61bc94 100644 --- a/arch/avr32/boards/merisc/setup.c +++ b/arch/avr32/boards/merisc/setup.c @@ -134,7 +134,7 @@ static void __init set_hw_addr(struct platform_device *pdev) regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/mimc200/setup.c b/arch/avr32/boards/mimc200/setup.c index 523d8e1..c4da5cb 100644 --- a/arch/avr32/boards/mimc200/setup.c +++ b/arch/avr32/boards/mimc200/setup.c @@ -162,7 +162,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/configs/atngw100_defconfig b/arch/avr32/configs/atngw100_defconfig index 9854013..6f9ca56 100644 --- a/arch/avr32/configs/atngw100_defconfig +++ b/arch/avr32/configs/atngw100_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -72,8 +70,8 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y -CONFIG_EEPROM_AT24=m CONFIG_NETDEVICES=y CONFIG_TUN=m CONFIG_NET_ETHERNET=y @@ -106,6 +104,7 @@ CONFIG_GPIO_SYSFS=y CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y CONFIG_USB_GADGET=y +CONFIG_USB_GADGET_VBUS_DRAW=350 CONFIG_USB_ZERO=m CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m @@ -115,14 +114,12 @@ CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_GPIO=y CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y -CONFIG_LEDS_TRIGGER_DEFAULT_ON=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -130,21 +127,23 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y -CONFIG_CONFIGFS_FS=m +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -CONFIG_UFS_FS=y +CONFIG_UBIFS_FS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -155,5 +154,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y 
CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_CRYPTO_PCBC=m diff --git a/arch/avr32/configs/atngw100_evklcd100_defconfig b/arch/avr32/configs/atngw100_evklcd100_defconfig index 7ceda35..7eece0a 100644 --- a/arch/avr32/configs/atngw100_evklcd100_defconfig +++ b/arch/avr32/configs/atngw100_evklcd100_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -74,8 +72,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -104,6 +104,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -127,6 +128,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -141,11 +143,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -155,7 +160,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -166,4 +170,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atngw100_evklcd101_defconfig b/arch/avr32/configs/atngw100_evklcd101_defconfig index 7bc5b2c..387eb9d 100644 --- a/arch/avr32/configs/atngw100_evklcd101_defconfig +++ b/arch/avr32/configs/atngw100_evklcd101_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -30,6 +27,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y 
@@ -73,8 +71,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -103,6 +103,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -126,6 +127,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -140,11 +142,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -154,7 +159,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -165,4 +169,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atngw100mkii_defconfig b/arch/avr32/configs/atngw100mkii_defconfig index 4bd3682..f0fe237 100644 --- a/arch/avr32/configs/atngw100mkii_defconfig +++ b/arch/avr32/configs/atngw100mkii_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -74,6 +72,7 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y CONFIG_TUN=m @@ -107,6 +106,7 @@ CONFIG_GPIO_SYSFS=y CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y CONFIG_USB_GADGET=y +CONFIG_USB_GADGET_VBUS_DRAW=350 CONFIG_USB_ZERO=m CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m @@ -116,14 +116,12 @@ CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_GPIO=y CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y -CONFIG_LEDS_TRIGGER_DEFAULT_ON=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -131,21 +129,23 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y -CONFIG_CONFIGFS_FS=m +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -CONFIG_UFS_FS=y +CONFIG_UBIFS_FS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y 
CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -156,5 +156,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_CRYPTO_PCBC=m diff --git a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig index f8437ef..e4a7c1d 100644 --- a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig +++ b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -32,6 +29,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -77,8 +75,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -107,6 +107,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -130,6 +131,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -144,11 +146,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -158,7 +163,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -169,4 +173,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig index 7f58f99..6f37f70 100644 --- a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig +++ b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set 
CONFIG_NO_HZ=y @@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -76,8 +74,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -106,6 +106,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -129,6 +130,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -143,11 +145,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -157,7 +162,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -168,4 +172,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atstk1002_defconfig b/arch/avr32/configs/atstk1002_defconfig index aec4c43..4fb01f5 100644 --- a/arch/avr32/configs/atstk1002_defconfig +++ b/arch/avr32/configs/atstk1002_defconfig @@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set @@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set @@ -26,6 +25,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -35,6 +35,7 @@ CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m CONFIG_NET_IPGRE=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -58,16 +59,14 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_DATAFLASH=m -CONFIG_MTD_M25P80=m CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_PWM=m CONFIG_ATMEL_TCLIB=y CONFIG_ATMEL_SSC=m -CONFIG_EEPROM_AT24=m # CONFIG_SCSI_PROC_FS is not set CONFIG_BLK_DEV_SD=m CONFIG_BLK_DEV_SR=m @@ -120,7 +119,6 @@ CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m # CONFIG_SND_SUPPORT_OLD_API is not set # CONFIG_SND_VERBOSE_PROCFS is not set -# CONFIG_SND_DRIVERS is not set CONFIG_SND_AT73C213=m # CONFIG_HID_SUPPORT is not set CONFIG_USB_GADGET=y @@ -131,16 +129,15 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=m +CONFIG_LEDS_CLASS=y CONFIG_LEDS_ATMEL_PWM=m CONFIG_LEDS_GPIO=m CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m 
CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_LEDS_TRIGGER_DEFAULT_ON=m CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -149,20 +146,23 @@ CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y CONFIG_TMPFS=y +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -# CONFIG_JFFS2_FS_WRITEBUFFER is not set CONFIG_UBIFS_FS=y -CONFIG_MINIX_FS=m CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y +CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m CONFIG_NLS_ISO8859_1=m CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y @@ -170,6 +170,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -# CONFIG_CRYPTO_HW is not set -CONFIG_CRC_T10DIF=m diff --git a/arch/avr32/configs/atstk1003_defconfig b/arch/avr32/configs/atstk1003_defconfig index 50ba3db..9faaf9b 100644 --- a/arch/avr32/configs/atstk1003_defconfig +++ b/arch/avr32/configs/atstk1003_defconfig @@ -2,22 +2,15 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_AUDIT=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set -# CONFIG_SLUB_DEBUG is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set @@ -33,6 +26,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -54,18 +48,18 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_DATAFLASH=m -CONFIG_MTD_M25P80=m +CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_PWM=m CONFIG_ATMEL_TCLIB=y CONFIG_ATMEL_SSC=m -CONFIG_EEPROM_AT24=m # CONFIG_SCSI_PROC_FS is not set CONFIG_BLK_DEV_SD=m CONFIG_BLK_DEV_SR=m +# CONFIG_SCSI_LOWLEVEL is not set CONFIG_ATA=m # CONFIG_SATA_PMP is not set CONFIG_PATA_AT32=m @@ -77,6 +71,7 @@ CONFIG_PPP_ASYNC=m CONFIG_PPP_DEFLATE=m CONFIG_PPP_BSDCOMP=m CONFIG_INPUT=m +CONFIG_INPUT_EVDEV=m # CONFIG_KEYBOARD_ATKBD is not set CONFIG_KEYBOARD_GPIO=m # CONFIG_MOUSE_PS2 is not set @@ -106,7 +101,6 @@ CONFIG_SND_PCM_OSS=m CONFIG_SND_AT73C213=m # CONFIG_HID_SUPPORT is not set CONFIG_USB_GADGET=y -CONFIG_USB_GADGET_DEBUG_FS=y CONFIG_USB_ZERO=m CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m @@ -116,36 +110,39 @@ CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_ATMEL_PWM=m -CONFIG_LEDS_GPIO=y +CONFIG_LEDS_GPIO=m CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=y -CONFIG_LEDS_TRIGGER_HEARTBEAT=y -CONFIG_LEDS_TRIGGER_DEFAULT_ON=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_HEARTBEAT=m CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y -CONFIG_DW_DMAC=y -CONFIG_EXT2_FS=m -CONFIG_EXT3_FS=m +CONFIG_EXT2_FS=y +CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set 
+CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y CONFIG_TMPFS=y -CONFIG_CONFIGFS_FS=m +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y +CONFIG_UBIFS_FS=y # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m CONFIG_NLS_ISO8859_1=m CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y +CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -CONFIG_CRC_T10DIF=m diff --git a/arch/avr32/configs/atstk1004_defconfig b/arch/avr32/configs/atstk1004_defconfig index 329e10b..3d2a5d8 100644 --- a/arch/avr32/configs/atstk1004_defconfig +++ b/arch/avr32/configs/atstk1004_defconfig @@ -1,19 +1,32 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set -# CONFIG_FUTEX is not set -# CONFIG_EPOLL is not set -# CONFIG_SIGNALFD is not set -# CONFIG_TIMERFD is not set -# CONFIG_EVENTFD is not set # CONFIG_COMPAT_BRK is not set -CONFIG_SLOB=y -# CONFIG_BLOCK is not set +CONFIG_PROFILING=y +CONFIG_OPROFILE=m +# CONFIG_KPROBES is not set +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_IOSCHED_DEADLINE is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_BOARD_ATSTK1004=y # CONFIG_OWNERSHIP_TRACE is not set +CONFIG_NMI_DEBUGGING=y +CONFIG_PM=y +CONFIG_CPU_FREQ=y +# CONFIG_CPU_FREQ_STAT is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -31,40 +44,104 @@ CONFIG_MTD=y CONFIG_MTD_PARTITIONS=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_CHAR=y +CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -# CONFIG_MISC_DEVICES is not set -# CONFIG_INPUT is not set +CONFIG_MTD_UBI=y +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y +CONFIG_ATMEL_PWM=m +CONFIG_ATMEL_TCLIB=y +CONFIG_ATMEL_SSC=m +# CONFIG_SCSI_PROC_FS is not set +CONFIG_BLK_DEV_SD=m +CONFIG_BLK_DEV_SR=m +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=m +# CONFIG_SATA_PMP is not set +CONFIG_PATA_AT32=m +CONFIG_NETDEVICES=y +# CONFIG_NETDEV_1000 is not set +# CONFIG_NETDEV_10000 is not set +CONFIG_PPP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_BSDCOMP=m +CONFIG_INPUT=m +CONFIG_INPUT_EVDEV=m +# CONFIG_KEYBOARD_ATKBD is not set +CONFIG_KEYBOARD_GPIO=m +# CONFIG_MOUSE_PS2 is not set +CONFIG_MOUSE_GPIO=m # CONFIG_SERIO is not set # CONFIG_VT is not set # CONFIG_DEVKMEM is not set CONFIG_SERIAL_ATMEL=y CONFIG_SERIAL_ATMEL_CONSOLE=y -# CONFIG_SERIAL_ATMEL_PDC is not set # CONFIG_LEGACY_PTYS is not set # CONFIG_HW_RANDOM is not set +CONFIG_I2C=m +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y +CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y CONFIG_FB=y CONFIG_FB_ATMEL=y CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LCD_CLASS_DEVICE=y CONFIG_LCD_LTV350QV=y # CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_USB_GADGET=y -CONFIG_USB_ETH=y -# CONFIG_USB_ETH_RNDIS is not set +CONFIG_USB_ZERO=m +CONFIG_USB_ETH=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FILE_STORAGE=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_MMC=y +CONFIG_MMC_TEST=m 
+CONFIG_MMC_ATMELMCI=y +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_ATMEL_PWM=m +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_HEARTBEAT=m CONFIG_RTC_CLASS=y -# CONFIG_RTC_INTF_PROC is not set CONFIG_RTC_DRV_AT32AP700X=y +CONFIG_DMADEVICES=y +CONFIG_EXT2_FS=y +CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set +# CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set +CONFIG_FUSE_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y -# CONFIG_PROC_PAGE_MONITOR is not set CONFIG_TMPFS=y +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -# CONFIG_JFFS2_FS_WRITEBUFFER is not set +CONFIG_UBIFS_FS=y # CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_FRAME_POINTER=y diff --git a/arch/avr32/configs/atstk1006_defconfig b/arch/avr32/configs/atstk1006_defconfig index dbcc1b5..1ed8f22 100644 --- a/arch/avr32/configs/atstk1006_defconfig +++ b/arch/avr32/configs/atstk1006_defconfig @@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set @@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set @@ -37,6 +36,7 @@ CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m CONFIG_NET_IPGRE=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -60,15 +60,13 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_DATAFLASH=m -CONFIG_MTD_DATAFLASH_OTP=y -CONFIG_MTD_M25P80=m CONFIG_MTD_NAND=y CONFIG_MTD_NAND_ATMEL=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_PWM=m CONFIG_ATMEL_TCLIB=y CONFIG_ATMEL_SSC=m @@ -132,17 +130,17 @@ CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m +CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=m +CONFIG_LEDS_CLASS=y CONFIG_LEDS_ATMEL_PWM=m CONFIG_LEDS_GPIO=m CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_LEDS_TRIGGER_DEFAULT_ON=m CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -156,15 +154,18 @@ CONFIG_EXT4_FS=y CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y CONFIG_TMPFS=y +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y CONFIG_UBIFS_FS=y -CONFIG_MINIX_FS=m CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y +CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m CONFIG_NLS_ISO8859_1=m CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y @@ -172,7 +173,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_CRYPTO_FIPS=y -# CONFIG_CRYPTO_HW is not set -CONFIG_CRC_T10DIF=m diff --git a/arch/avr32/configs/favr-32_defconfig b/arch/avr32/configs/favr-32_defconfig index 0c813b6..aeadc95 100644 --- a/arch/avr32/configs/favr-32_defconfig +++ b/arch/avr32/configs/favr-32_defconfig @@ -11,7 
+11,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/avr32/configs/hammerhead_defconfig b/arch/avr32/configs/hammerhead_defconfig index dcc01f0..1692bee 100644 --- a/arch/avr32/configs/hammerhead_defconfig +++ b/arch/avr32/configs/hammerhead_defconfig @@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y diff --git a/arch/avr32/include/asm/syscalls.h b/arch/avr32/include/asm/syscalls.h index ab608b7..244f2ac 100644 --- a/arch/avr32/include/asm/syscalls.h +++ b/arch/avr32/include/asm/syscalls.h @@ -15,20 +15,6 @@ #include <linux/types.h> #include <linux/signal.h> -/* kernel/process.c */ -asmlinkage int sys_fork(struct pt_regs *); -asmlinkage int sys_clone(unsigned long, unsigned long, - unsigned long, unsigned long, - struct pt_regs *); -asmlinkage int sys_vfork(struct pt_regs *); -asmlinkage int sys_execve(const char __user *, char __user *__user *, - char __user *__user *, struct pt_regs *); - -/* kernel/signal.c */ -asmlinkage int sys_sigaltstack(const stack_t __user *, stack_t __user *, - struct pt_regs *); -asmlinkage int sys_rt_sigreturn(struct pt_regs *); - /* mm/cache.c */ asmlinkage int sys_cacheflush(int, void __user *, size_t); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 9c46aaa..ef5a2a0 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -367,14 +367,13 @@ asmlinkage int sys_fork(struct pt_regs *regs) } asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp, - unsigned long parent_tidptr, - unsigned long child_tidptr, struct pt_regs *regs) + void __user *parent_tidptr, void __user *child_tidptr, + struct pt_regs *regs) { if (!newsp) newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, - (int __user *)parent_tidptr, - (int __user *)child_tidptr); + return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, + child_tidptr); } asmlinkage int sys_vfork(struct pt_regs *regs) diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c index 668ed28..05ad291 100644 --- a/arch/avr32/kernel/time.c +++ b/arch/avr32/kernel/time.c @@ -35,7 +35,6 @@ static struct clocksource counter = { .rating = 50, .read = read_cycle_count, .mask = CLOCKSOURCE_MASK(32), - .shift = 16, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -123,9 +122,7 @@ void __init time_init(void) /* figure rate for counter */ counter_hz = clk_get_rate(boot_cpu_data.clk); - counter.mult = clocksource_hz2mult(counter_hz, counter.shift); - - ret = clocksource_register(&counter); + ret = clocksource_register_hz(&counter, counter_hz); if (ret) pr_debug("timer: could not register clocksource: %d\n", ret); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index ac76da0..89accc6 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -618,7 +618,7 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, } /* forward declaration */ -static static const struct dentry_operations pfmfs_dentry_operations; +static const struct dentry_operations pfmfs_dentry_operations; static struct dentry * pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) diff --git a/arch/ia64/mm/hugetlbpage.c 
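The avr32 time.c hunk above drops the hand-computed mult/shift pair and registers the clocksource with clocksource_register_hz(), letting the core derive the scaling factors from the counter frequency. A minimal sketch of the new-style registration follows; the .name string and the sysreg read are placeholders modelled on the avr32 driver rather than taken verbatim from this hunk.

#include <linux/clocksource.h>

static cycle_t read_cycle_count(struct clocksource *cs)
{
	return (cycle_t)sysreg_read(COUNT);	/* CPU cycle counter */
}

static struct clocksource counter = {
	.name	= "avr32_counter",
	.rating	= 50,
	.read	= read_cycle_count,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	/* no .mult/.shift here: clocksource_register_hz() computes them */
};

static int __init counter_register(unsigned long counter_hz)
{
	/* replaces the old clocksource_hz2mult() + clocksource_register() pair */
	return clocksource_register_hz(&counter, counter_hz);
}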
b/arch/ia64/mm/hugetlbpage.c index 1841ee7..5ca674b 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) if (pud) { pmd = pmd_alloc(mm, pud, taddr); if (pmd) - pte = pte_alloc_map(mm, pmd, taddr); + pte = pte_alloc_map(mm, NULL, pmd, taddr); } return pte; } diff --git a/arch/mips/include/asm/mman.h b/arch/mips/include/asm/mman.h index c892bfb..785b4ea 100644 --- a/arch/mips/include/asm/mman.h +++ b/arch/mips/include/asm/mman.h @@ -77,6 +77,9 @@ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ #define MADV_HWPOISON 100 /* poison a page for testing */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 6f51dda..d87a72e 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -46,17 +46,9 @@ static DEFINE_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { #ifdef MODULE_START - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULE_START, MODULE_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); #else if (size == 0) return NULL; diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h index 9749c8a..f5b7bf5 100644 --- a/arch/parisc/include/asm/mman.h +++ b/arch/parisc/include/asm/mman.h @@ -59,6 +59,9 @@ #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 67 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 68 /* Not worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 #define MAP_VARIABLE 0 diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index d7efdbf..fec1320 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -16,6 +16,16 @@ #ifdef __HAVE_ARCH_PTE_SPECIAL +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. 
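The MADV_HUGEPAGE/MADV_NOHUGEPAGE values added above (14/15 here, 67/68 on parisc) are advice codes for madvise(2): userspace uses them to opt a mapping in or out of transparent hugepages. A minimal userspace sketch, assuming a THP-enabled kernel and libc headers that may not yet define the constant:

#include <stdlib.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE	14	/* value from the hunks above; parisc uses 67 */
#endif

int main(void)
{
	size_t len = 16UL * 1024 * 1024;
	void *buf;

	/* 2 MB alignment gives the THP fault path a chance to use huge pages */
	if (posix_memalign(&buf, 2UL * 1024 * 1024, len))
		return 1;

	/* purely advisory: the kernel may still back this with 4 KB pages */
	madvise(buf, len, MADV_HUGEPAGE);
	return 0;
}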
+ */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -47,6 +57,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(page); return 0; } + if (PageTail(page)) + get_huge_page_tail(page); pages[*nr] = page; (*nr)++; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 9163db3..d776234 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } } diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index ee3c7dd..8d348c4 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -23,17 +23,11 @@ static void *module_map(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size || size > MODULES_LEN) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); } static char *dot2underscore(char *name) diff --git a/arch/sparc/mm/generic_32.c b/arch/sparc/mm/generic_32.c index 5edcac1..e6067b7 100644 --- a/arch/sparc/mm/generic_32.c +++ b/arch/sparc/mm/generic_32.c @@ -50,7 +50,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/generic_64.c b/arch/sparc/mm/generic_64.c index 04f2bf4..3cb00df 100644 --- a/arch/sparc/mm/generic_64.c +++ b/arch/sparc/mm/generic_64.c @@ -92,7 +92,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 5fdddf1..f4e9764 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -214,7 +214,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } return pte; } diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 3d099f9..1aee587 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, if (!pmd) goto out_pmd; - pte = pte_alloc_map(mm, pmd, proc); + pte = pte_alloc_map(mm, NULL, pmd, proc); if (!pte) goto out_pte; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa75f21..ffd7f8d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -822,6 +822,7 @@ extern bool kvm_rebooting; #define KVM_ARCH_WANT_MMU_NOTIFIER int 
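The pte_alloc_map() conversions above reflect its new four-argument form, pte_alloc_map(mm, vma, pmd, addr); walks that can never encounter a transparent huge pmd (hugetlb, uml stubs, io remapping) simply pass NULL for the vma. A condensed sketch of the allocation walk these callers share; the function name is illustrative only.

static pte_t *alloc_pte_for(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	pud_t *pud = pud_alloc(mm, pgd, addr);
	pmd_t *pmd;

	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;
	/* NULL vma: this mapping cannot be a transparent huge pmd */
	return pte_alloc_map(mm, NULL, pmd, addr);
}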
kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 7709c12..2071a8b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr, { PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); } +static inline void pmd_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp); +} static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); } +static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp); +} + static inline pte_t __pte(pteval_t val) { pteval_t ret; @@ -543,6 +554,20 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#if PAGETABLE_LEVELS >= 3 + if (sizeof(pmdval_t) > sizeof(long)) + /* 5 arg words */ + pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd); + else + PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd); +#endif +} +#endif + static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { pmdval_t val = native_pmd_val(pmd); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b82bac9..8288509 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -265,10 +265,16 @@ struct pv_mmu_ops { void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmdval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void (*pte_update_defer)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + void (*pmd_update)(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp); + void (*pmd_update_defer)(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 2334982..98391db 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) +{ + return __pmd(xchg((pmdval_t *)xp, 0)); +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, * split up the 29 bits of offset into this range: diff --git a/arch/x86/include/asm/pgtable-3level.h 
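The new pv_mmu_ops members mirror the existing pte hooks, and the native table (wired up in the arch/x86/kernel/paravirt.c hunk further down) simply points them at native_set_pmd_at() and paravirt_nop. A hypothetical guest backend that only wants to observe huge-pmd installs could look roughly like this; the "myhv_" names are made up for the sketch.

static void myhv_set_pmd_at(struct mm_struct *mm, unsigned long addr,
			    pmd_t *pmdp, pmd_t pmd)
{
	native_set_pmd_at(mm, addr, pmdp, pmd);
	/* notify the hypervisor about the new 2 MB mapping here */
}

static struct pv_mmu_ops myhv_mmu_ops = {
	.set_pmd_at	  = myhv_set_pmd_at,
	.pmd_update	  = paravirt_nop,	/* nothing extra to do */
	.pmd_update_defer = paravirt_nop,
	/* ... all other ops filled in as in the native pv_mmu_ops ... */
};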
b/arch/x86/include/asm/pgtable-3level.h index 177b016..94b979d 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +union split_pmd { + struct { + u32 pmd_low; + u32 pmd_high; + }; + pmd_t pmd; +}; +static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) +{ + union split_pmd res, *orig = (union split_pmd *)pmdp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pmd_low = xchg(&orig->pmd_low, 0); + res.pmd_high = orig->pmd_high; + orig->pmd_high = 0; + + return res.pmd; +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ada823a..18601c8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #else /* !CONFIG_PARAVIRT */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) +#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) +#define pmd_update(mm, addr, ptep) do { } while (0) +#define pmd_update_defer(mm, addr, ptep) do { } while (0) #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) @@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } +static inline int pmd_young(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_ACCESSED; +} + static inline int pte_write(pte_t pte) { return pte_flags(pte) & _PAGE_RW; @@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte) (_PAGE_PSE | _PAGE_PRESENT); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_SPLITTING; +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_PSE; +} + +static inline int has_transparent_hugepage(void) +{ + return cpu_has_pse; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static inline pte_t pte_set_flags(pte_t pte, pteval_t set) { pteval_t v = native_pte_val(pte); @@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL); } +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v & ~clear); +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DIRTY); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_PSE); +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_mkwrite(pmd_t 
pmd) +{ + return pmd_set_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. @@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte(val); } +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pmdval_t val = pmd_val(pmd); + + val &= _HPAGE_CHG_MASK; + val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK; + + return __pmd(val); +} + /* mprotect needs to preserve PAT bits when updating vm_page_prot */ #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) @@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) return res; } +static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) +{ + pmd_t res = *pmdp; + + native_pmd_clear(pmdp); + return res; +} + static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep , pte_t pte) { native_set_pte(ptep, pte); } +static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp , pmd_t pmd) +{ + native_set_pmd(pmdp, pmd); +} + #ifndef CONFIG_PARAVIRT /* * Rules for using pte_update - it must be called after any PTE update which @@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address) +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + + +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_RW; +} + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + pmd_update(mm, addr, pmdp); + return pmd; +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); + pmd_update(mm, addr, pmdp); +} + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index f86da20..975f709 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) 
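Taken together, mk_pmd(), the pmd_mk*() helpers and set_pmd_at() are what the THP core in mm/huge_memory.c (not part of these hunks) uses to install a 2 MB anonymous mapping. A condensed sketch of that pattern, with locking, rmap and counter updates omitted:

/* haddr is HPAGE_PMD_SIZE aligned, page is the compound 2 MB page */
static void install_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			     unsigned long haddr, pmd_t *pmdp, struct page *page)
{
	pmd_t entry;

	entry = mk_pmd(page, vma->vm_page_prot);
	entry = pmd_mkhuge(pmd_mkdirty(pmd_mkwrite(entry)));
	set_pmd_at(mm, haddr, pmdp, entry);
	/* the real code also does page_add_new_anon_rmap(), mm counters, etc. */
}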
native_set_pte(ptep, pte); } +static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + *pmdp = pmd; +} + +static inline void native_pmd_clear(pmd_t *pmd) +{ + native_set_pmd(pmd, native_make_pmd(0)); +} + static inline pte_t native_ptep_get_and_clear(pte_t *xp) { #ifdef CONFIG_SMP @@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #endif } -static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) +static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) { - *pmdp = pmd; -} - -static inline void native_pmd_clear(pmd_t *pmd) -{ - native_set_pmd(pmd, native_make_pmd(0)); +#ifdef CONFIG_SMP + return native_make_pmd(xchg(&xp->pmd, 0)); +#else + /* native_local_pmdp_get_and_clear, + but duplicated because of cyclic dependency */ + pmd_t ret = *xp; + native_pmd_clear(xp); + return ret; +#endif } static inline void native_set_pud(pud_t *pudp, pud_t pud) @@ -168,6 +181,7 @@ extern void cleanup_highmap(void); #define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) #define __HAVE_ARCH_PTE_SAME + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index d1f4a76..7db7723 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -22,6 +22,7 @@ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ /* If _PAGE_BIT_PRESENT is clear, we use these: */ @@ -45,6 +46,7 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) #define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_KMEMCHECK @@ -70,6 +72,7 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) #define _PAGE_CACHE_WB (0) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 8760cc6..f25bdf2 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -42,6 +42,11 @@ extern unsigned int machine_to_phys_order; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern int m2p_add_override(unsigned long mfn, struct page *page); +extern int m2p_remove_override(struct page *page); +extern struct page *m2p_find_override(unsigned long mfn); +extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); + static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; @@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; - if (unlikely((mfn >> machine_to_phys_order) != 0)) - return ~0; - pfn = 0; /* * The array access can fail (e.g., device space beyond end of RAM). 
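_PAGE_SPLITTING reuses the UNUSED1 bit and is only meaningful on a PSE pmd: pmd_trans_splitting() lets generic walkers notice that __split_huge_page_refcount() is in progress and wait for it. The usual caller pattern looks roughly as below; wait_split_huge_page() and the page_table_lock protocol come from the THP core, not from these hunks.

	pmd = pmd_offset(pud, addr);
	if (pmd_trans_huge(*pmd)) {
		spin_lock(&mm->page_table_lock);
		if (likely(pmd_trans_huge(*pmd))) {
			if (unlikely(pmd_trans_splitting(*pmd))) {
				spin_unlock(&mm->page_table_lock);
				/* sleep until the huge pmd has been split to ptes */
				wait_split_huge_page(vma->anon_vma, pmd);
			} else {
				/* handle the whole 2 MB mapping under the lock */
				spin_unlock(&mm->page_table_lock);
				return;
			}
		} else
			spin_unlock(&mm->page_table_lock);
	}
	/* fall through to the ordinary pte walk */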
@@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) */ __get_user(pfn, &machine_to_phys_mapping[mfn]); + /* + * If this appears to be a foreign mfn (because the pfn + * doesn't map back to the mfn), then check the local override + * table to see if there's a better pfn to use. + */ + if (get_phys_to_machine(pfn) != mfn) + pfn = m2p_find_override_pfn(mfn, pfn); + return pfn; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8f29560..ab23f1a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -37,20 +37,11 @@ void *module_alloc(unsigned long size) { - struct vm_struct *area; - - if (!size) - return NULL; - size = PAGE_ALIGN(size); - if (size > MODULES_LEN) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + -1, __builtin_return_address(0)); } /* Free memory returned from module_alloc */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c5b2500..869e1ae 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = { .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, + .set_pmd_at = native_set_pmd_at, .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, + .pmd_update = paravirt_nop, + .pmd_update_defer = paravirt_nop, .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index c2f1b26..998e972 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 61fb985..863f875 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9cafbb4..f02b8ed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level, level, max_level; - slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) - return PT_PAGE_TABLE_LEVEL; + return true; + return false; +} + +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + int host_level, level, max_level; host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -941,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return young; } +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long 
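The vm86_32.c hunk shows the simpler option for code that insists on walking individual ptes: call split_huge_page_pmd() first, so any transparent huge pmd is turned back into 4 KB ptes before the walk. The pattern, roughly:

	pmd = pmd_offset(pud, addr);
	split_huge_page_pmd(mm, pmd);		/* no-op unless *pmd is a huge pmd */
	if (pmd_none_or_clear_bad(pmd))
		goto out;
	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	/* ... touch the 4 KB ptes as before ... */
	pte_unmap_unlock(pte, ptl);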
*rmapp, + unsigned long data) +{ + u64 *spte; + int young = 0; + + /* + * If there's no access bit in the secondary pte set by the + * hardware it's up to gup-fast/gup to set the access bit in + * the primary pte or in the page structure. + */ + if (!shadow_accessed_mask) + goto out; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + young = _spte & PT_ACCESSED_MASK; + if (young) { + young = 1; + break; + } + spte = rmap_next(kvm, rmapp, spte); + } +out: + return young; +} + #define RMAP_RECYCLE_THRESHOLD 1000 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -961,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva) return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); } +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { @@ -2281,6 +2319,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) return 1; } +static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, + gfn_t *gfnp, pfn_t *pfnp, int *levelp) +{ + pfn_t pfn = *pfnp; + gfn_t gfn = *gfnp; + int level = *levelp; + + /* + * Check if it's a transparent hugepage. If this would be an + * hugetlbfs page, level wouldn't be set to + * PT_PAGE_TABLE_LEVEL and there would be no adjustment done + * here. + */ + if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + level == PT_PAGE_TABLE_LEVEL && + PageTransCompound(pfn_to_page(pfn)) && + !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { + unsigned long mask; + /* + * mmu_notifier_retry was successful and we hold the + * mmu_lock here, so the pmd can't become splitting + * from under us, and in turn + * __split_huge_page_refcount() can't run from under + * us and we can safely transfer the refcount from + * PG_tail to PG_head as we switch the pfn to tail to + * head. + */ + *levelp = level = PT_DIRECTORY_LEVEL; + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { + gfn &= ~mask; + *gfnp = gfn; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + if (!get_page_unless_zero(pfn_to_page(pfn))) + BUG(); + *pfnp = pfn; + } + } +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); @@ -2289,20 +2369,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, { int r; int level; + int force_pt_level; pfn_t pfn; unsigned long mmu_seq; bool map_writable; - level = mapping_level(vcpu, gfn); - - /* - * This path builds a PAE pagetable - so we can map 2mb pages at - * maximum. Therefore check if the level is larger than that. - */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + /* + * This path builds a PAE pagetable - so we can map + * 2mb pages at maximum. Therefore check if the level + * is larger than that. 
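kvm_test_age_hva() is the non-clearing counterpart of kvm_age_hva(): it reports whether any spte for the hva carries the hardware accessed bit without zapping anything, which is what the new mmu_notifier ->test_young() callback needs. The callback itself lives in virt/kvm/kvm_main.c and is not part of these hunks; a plausible sketch of it:

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_test_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}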
+ */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -2318,6 +2403,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); @@ -2655,6 +2742,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, pfn_t pfn; int r; int level; + int force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -2667,9 +2755,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - level = mapping_level(vcpu, gfn); - - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -2684,6 +2775,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 53210f1..6bccc24 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; + int force_pt_level; unsigned long mmu_seq; bool map_writable; @@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } - if (walker.level >= PT_DIRECTORY_LEVEL) { + if (walker.level >= PT_DIRECTORY_LEVEL) + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + else + force_pt_level = 1; + if (!force_pt_level) { level = min(walker.level, mapping_level(vcpu, walker.gfn)); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } @@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); + if (!force_pt_level) + transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn, map_writable, prefault); (void)sptep; diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 738e659..dbe34b9 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -8,6 +8,7 @@ #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/highmem.h> +#include <linux/swap.h> #include <asm/pgtable.h> @@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); get_page(page); + SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr) VM_BUG_ON(page != compound_head(page)); 
VM_BUG_ON(page_count(page) == 0); atomic_add(nr, &page->_count); + SetPageReferenced(page); +} + +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); } static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, @@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); (*nr)++; page++; refs++; @@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + /* + * The pmd_trans_splitting() check below explains why + * pmdp_splitting_flush has to flush the tlb, to stop + * this gup-fast code from running while we set the + * splitting bit in the pmd. Returning zero will take + * the slow path that will call wait_split_huge_page() + * if the pmd is still in splitting state. gup-fast + * can't because it has irq disabled and + * wait_split_huge_page() would never return as the + * tlb flush IPI wouldn't run. + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8be8c7d..500242d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma, return changed; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + int changed = !pmd_same(*pmdp, entry); + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (changed && dirty) { + *pmdp = entry; + pmd_update_defer(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + + return changed; +} +#endif + int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, return ret; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int ret = 0; + + if (pmd_young(*pmdp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pmdp); + + if (ret) + pmd_update(vma->vm_mm, addr, pmdp); + + return ret; +} +#endif + int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { @@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, return young; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + + return young; +} + +void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int set; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set = !test_and_set_bit(_PAGE_BIT_SPLITTING, + (unsigned long *)pmdp); + if (set) { + pmd_update(vma->vm_mm, address, pmdp); + /* need tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } +} +#endif + /** * reserve_top_address - reserves a hole in the top of kernel 
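The pmdp_* helpers added to pgtable.c are the huge-pmd analogues of their ptep_* neighbours. The THP write-fault path in mm/huge_memory.c (outside these hunks) uses pmdp_set_access_flags() to mark a huge pmd young, dirty and writable in one shot, roughly:

	pmd_t entry;

	entry = pmd_mkyoung(orig_pmd);
	entry = pmd_mkdirty(pmd_mkwrite(entry));	/* write fault on a writable vma */
	/* rewrites the pmd and flushes the 2 MB TLB range only if it actually changed */
	pmdp_set_access_flags(vma, haddr, pmdp, entry, 1);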
address space * @reserve - size of hole to reserve diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 7793851..17c565d 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o platform-pci-unplug.o + grant-table.o suspend.o platform-pci-unplug.o \ + p2m.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 44924e5..7575e55 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) -/* - * Xen leaves the responsibility for maintaining p2m mappings to the - * guests themselves, but it must also access and update the p2m array - * during suspend/resume when all the pages are reallocated. - * - * The p2m table is logically a flat array, but we implement it as a - * three-level tree to allow the address space to be sparse. - * - * Xen - * | - * p2m_top p2m_top_mfn - * / \ / \ - * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn - * / \ / \ / / - * p2m p2m p2m p2m p2m p2m p2m ... - * - * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. - * - * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the - * maximum representable pseudo-physical address space is: - * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages - * - * P2M_PER_PAGE depends on the architecture, as a mfn is always - * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to - * 512 and 1024 entries respectively. - */ - -unsigned long xen_max_p2m_pfn __read_mostly; - -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - -/* Placeholders for holes in the address space */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); - -static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); - -RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); - -static inline unsigned p2m_top_index(unsigned long pfn) -{ - BUG_ON(pfn >= MAX_P2M_PFN); - return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); -} - -static inline unsigned p2m_mid_index(unsigned long pfn) -{ - return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; -} - -static inline unsigned p2m_index(unsigned long pfn) -{ - return pfn % P2M_PER_PAGE; -} - -static void p2m_top_init(unsigned long ***top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = p2m_mid_missing; -} - -static void p2m_top_mfn_init(unsigned long *top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = virt_to_mfn(p2m_mid_missing_mfn); -} - -static void p2m_top_mfn_p_init(unsigned long **top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = p2m_mid_missing_mfn; -} - -static 
void p2m_mid_init(unsigned long **mid) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = p2m_missing; -} - -static void p2m_mid_mfn_init(unsigned long *mid) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = virt_to_mfn(p2m_missing); -} - -static void p2m_init(unsigned long *p2m) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - p2m[i] = INVALID_P2M_ENTRY; -} - -/* - * Build the parallel p2m_top_mfn and p2m_mid_mfn structures - * - * This is called both at boot time, and after resuming from suspend: - * - At boot time we're called very early, and must use extend_brk() - * to allocate memory. - * - * - After resume we're called from within stop_machine, but the mfn - * tree should alreay be completely allocated. - */ -void xen_build_mfn_list_list(void) -{ - unsigned long pfn; - - /* Pre-initialize p2m_top_mfn to be completely missing */ - if (p2m_top_mfn == NULL) { - p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(p2m_mid_missing_mfn); - - p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_mfn_p_init(p2m_top_mfn_p); - - p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_mfn_init(p2m_top_mfn); - } else { - /* Reinitialise, mfn's all change after migration */ - p2m_mid_mfn_init(p2m_mid_missing_mfn); - } - - for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - unsigned long **mid; - unsigned long *mid_mfn_p; - - mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; - - /* Don't bother allocating any mfn mid levels if - * they're just missing, just update the stored mfn, - * since all could have changed over a migrate. - */ - if (mid == p2m_mid_missing) { - BUG_ON(mididx); - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); - pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; - continue; - } - - if (mid_mfn_p == p2m_mid_missing_mfn) { - /* - * XXX boot-time only! We should never find - * missing parts of the mfn tree after - * runtime. extend_brk() will BUG if we call - * it too late. - */ - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - } - - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); - } -} - -void xen_setup_mfn_list_list(void) -{ - BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn); - HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; -} - -/* Set up p2m_top to point to the domain-builder provided p2m pages */ -void __init xen_build_dynamic_phys_to_machine(void) -{ - unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; - unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - unsigned long pfn; - - xen_max_p2m_pfn = max_pfn; - - p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m_missing); - - p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(p2m_mid_missing); - - p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_init(p2m_top); - - /* - * The domain builder gives us a pre-constructed p2m array in - * mfn_list for all the pages initially given to us, so we just - * need to graft that into our tree structure. 
- */ - for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - - if (p2m_top[topidx] == p2m_mid_missing) { - unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid); - - p2m_top[topidx] = mid; - } - - p2m_top[topidx][mididx] = &mfn_list[pfn]; - } -} - -unsigned long get_phys_to_machine(unsigned long pfn) -{ - unsigned topidx, mididx, idx; - - if (unlikely(pfn >= MAX_P2M_PFN)) - return INVALID_P2M_ENTRY; - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - return p2m_top[topidx][mididx][idx]; -} -EXPORT_SYMBOL_GPL(get_phys_to_machine); - -static void *alloc_p2m_page(void) -{ - return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); -} - -static void free_p2m_page(void *p) -{ - free_page((unsigned long)p); -} - -/* - * Fully allocate the p2m structure for a given pfn. We need to check - * that both the top and mid levels are allocated, and make sure the - * parallel mfn tree is kept in sync. We may race with other cpus, so - * the new pages are installed with cmpxchg; if we lose the race then - * simply free the page we allocated and use the one that's there. - */ -static bool alloc_p2m(unsigned long pfn) -{ - unsigned topidx, mididx; - unsigned long ***top_p, **mid; - unsigned long *top_mfn_p, *mid_mfn; - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - - top_p = &p2m_top[topidx]; - mid = *top_p; - - if (mid == p2m_mid_missing) { - /* Mid level is missing, allocate a new one */ - mid = alloc_p2m_page(); - if (!mid) - return false; - - p2m_mid_init(mid); - - if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) - free_p2m_page(mid); - } - - top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = p2m_top_mfn_p[topidx]; - - BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); - - if (mid_mfn == p2m_mid_missing_mfn) { - /* Separately check the mid mfn level */ - unsigned long missing_mfn; - unsigned long mid_mfn_mfn; - - mid_mfn = alloc_p2m_page(); - if (!mid_mfn) - return false; - - p2m_mid_mfn_init(mid_mfn); - - missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); - mid_mfn_mfn = virt_to_mfn(mid_mfn); - if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) - free_p2m_page(mid_mfn); - else - p2m_top_mfn_p[topidx] = mid_mfn; - } - - if (p2m_top[topidx][mididx] == p2m_missing) { - /* p2m leaf page is missing */ - unsigned long *p2m; - - p2m = alloc_p2m_page(); - if (!p2m) - return false; - - p2m_init(p2m); - - if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) - free_p2m_page(p2m); - else - mid_mfn[mididx] = virt_to_mfn(p2m); - } - - return true; -} - -/* Try to install p2m mapping; fail if intermediate bits missing */ -bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - unsigned topidx, mididx, idx; - - if (unlikely(pfn >= MAX_P2M_PFN)) { - BUG_ON(mfn != INVALID_P2M_ENTRY); - return true; - } - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - if (p2m_top[topidx][mididx] == p2m_missing) - return mfn == INVALID_P2M_ENTRY; - - p2m_top[topidx][mididx][idx] = mfn; - - return true; -} - -bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return true; - } - - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!alloc_p2m(pfn)) - return false; - - if (!__set_phys_to_machine(pfn, mfn)) - return false; - } - - return true; -} - unsigned long 
arbitrary_virt_to_mfn(void *vaddr) { xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c new file mode 100644 index 0000000..8f2251d --- /dev/null +++ b/arch/x86/xen/p2m.c @@ -0,0 +1,510 @@ +/* + * Xen leaves the responsibility for maintaining p2m mappings to the + * guests themselves, but it must also access and update the p2m array + * during suspend/resume when all the pages are reallocated. + * + * The p2m table is logically a flat array, but we implement it as a + * three-level tree to allow the address space to be sparse. + * + * Xen + * | + * p2m_top p2m_top_mfn + * / \ / \ + * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn + * / \ / \ / / + * p2m p2m p2m p2m p2m p2m p2m ... + * + * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. + * + * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the + * maximum representable pseudo-physical address space is: + * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages + * + * P2M_PER_PAGE depends on the architecture, as a mfn is always + * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to + * 512 and 1024 entries respectively. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/sched.h> + +#include <asm/cache.h> +#include <asm/setup.h> + +#include <asm/xen/page.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include "xen-ops.h" + +static void __init m2p_override_init(void); + +unsigned long xen_max_p2m_pfn __read_mostly; + +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); + +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); + +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); + +static inline unsigned p2m_top_index(unsigned long pfn) +{ + BUG_ON(pfn >= MAX_P2M_PFN); + return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ + return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; +} + +static inline unsigned p2m_index(unsigned long pfn) +{ + return pfn % P2M_PER_PAGE; +} + +static void p2m_top_init(unsigned long ***top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_top_mfn_p_init(unsigned long **top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing_mfn; +} + +static void p2m_mid_init(unsigned long **mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = p2m_missing; +} + +static void p2m_mid_mfn_init(unsigned long *mid) +{ + unsigned i; + + for (i = 0; i < 
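To put numbers on the header comment: on 64-bit, P2M_PER_PAGE, P2M_MID_PER_PAGE and P2M_TOP_PER_PAGE are all 4096 / 8 = 512, so MAX_P2M_PFN is 512 * 512 * 512 = 134,217,728 frames, i.e. roughly 512 GB of pseudo-physical address space; on 32-bit each level holds 1024 entries instead.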
P2M_MID_PER_PAGE; i++) + mid[i] = virt_to_mfn(p2m_missing); +} + +static void p2m_init(unsigned long *p2m) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + * to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + * tree should alreay be completely allocated. + */ +void xen_build_mfn_list_list(void) +{ + unsigned long pfn; + + /* Pre-initialize p2m_top_mfn to be completely missing */ + if (p2m_top_mfn == NULL) { + p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_missing_mfn); + + p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p_init(p2m_top_mfn_p); + + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_init(p2m_top_mfn); + } else { + /* Reinitialise, mfn's all change after migration */ + p2m_mid_mfn_init(p2m_mid_missing_mfn); + } + + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned long **mid; + unsigned long *mid_mfn_p; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + + /* Don't bother allocating any mfn mid levels if + * they're just missing, just update the stored mfn, + * since all could have changed over a migrate. + */ + if (mid == p2m_mid_missing) { + BUG_ON(mididx); + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); + pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; + continue; + } + + if (mid_mfn_p == p2m_mid_missing_mfn) { + /* + * XXX boot-time only! We should never find + * missing parts of the mfn tree after + * runtime. extend_brk() will BUG if we call + * it too late. + */ + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + } + + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); + } +} + +void xen_setup_mfn_list_list(void) +{ + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn); + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; +} + +/* Set up p2m_top to point to the domain-builder provided p2m pages */ +void __init xen_build_dynamic_phys_to_machine(void) +{ + unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); + unsigned long pfn; + + xen_max_p2m_pfn = max_pfn; + + p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_missing); + + p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_missing); + + p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_init(p2m_top); + + /* + * The domain builder gives us a pre-constructed p2m array in + * mfn_list for all the pages initially given to us, so we just + * need to graft that into our tree structure. 
+ */ + for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + } + + p2m_top[topidx][mididx] = &mfn_list[pfn]; + } + + m2p_override_init(); +} + +unsigned long get_phys_to_machine(unsigned long pfn) +{ + unsigned topidx, mididx, idx; + + if (unlikely(pfn >= MAX_P2M_PFN)) + return INVALID_P2M_ENTRY; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + return p2m_top[topidx][mididx][idx]; +} +EXPORT_SYMBOL_GPL(get_phys_to_machine); + +static void *alloc_p2m_page(void) +{ + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} + +static void free_p2m_page(void *p) +{ + free_page((unsigned long)p); +} + +/* + * Fully allocate the p2m structure for a given pfn. We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync. We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. + */ +static bool alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx; + unsigned long ***top_p, **mid; + unsigned long *top_mfn_p, *mid_mfn; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + + top_p = &p2m_top[topidx]; + mid = *top_p; + + if (mid == p2m_mid_missing) { + /* Mid level is missing, allocate a new one */ + mid = alloc_p2m_page(); + if (!mid) + return false; + + p2m_mid_init(mid); + + if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) + free_p2m_page(mid); + } + + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = p2m_top_mfn_p[topidx]; + + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); + + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; + + p2m_mid_mfn_init(mid_mfn); + + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + free_p2m_page(mid_mfn); + else + p2m_top_mfn_p[topidx] = mid_mfn; + } + + if (p2m_top[topidx][mididx] == p2m_missing) { + /* p2m leaf page is missing */ + unsigned long *p2m; + + p2m = alloc_p2m_page(); + if (!p2m) + return false; + + p2m_init(p2m); + + if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + free_p2m_page(p2m); + else + mid_mfn[mididx] = virt_to_mfn(p2m); + } + + return true; +} + +/* Try to install p2m mapping; fail if intermediate bits missing */ +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + unsigned topidx, mididx, idx; + + if (unlikely(pfn >= MAX_P2M_PFN)) { + BUG_ON(mfn != INVALID_P2M_ENTRY); + return true; + } + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + if (p2m_top[topidx][mididx] == p2m_missing) + return mfn == INVALID_P2M_ENTRY; + + p2m_top[topidx][mididx][idx] = mfn; + + return true; +} + +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return true; + } + + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (!alloc_p2m(pfn)) + return false; + + if (!__set_phys_to_machine(pfn, mfn)) + return false; + } + + return true; +} + 
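alloc_p2m() above relies on the same lock-free install idiom at every level of the tree: allocate a page, try to cmpxchg it over the shared placeholder, and free it again if another CPU got there first. Generalized, with illustrative names:

	new_level = alloc_p2m_page();
	if (!new_level)
		return false;
	init_level(new_level);

	/* swing the placeholder pointer to our freshly initialised page */
	if (cmpxchg(slot, placeholder, new_level) != placeholder)
		free_p2m_page(new_level);	/* lost the race: use the winner's page */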
+#define M2P_OVERRIDE_HASH_SHIFT 10 +#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) + +static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); +static DEFINE_SPINLOCK(m2p_override_lock); + +static void __init m2p_override_init(void) +{ + unsigned i; + + m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, + sizeof(unsigned long)); + + for (i = 0; i < M2P_OVERRIDE_HASH; i++) + INIT_LIST_HEAD(&m2p_overrides[i]); +} + +static unsigned long mfn_hash(unsigned long mfn) +{ + return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); +} + +/* Add an MFN override for a particular page */ +int m2p_add_override(unsigned long mfn, struct page *page) +{ + unsigned long flags; + unsigned long pfn; + unsigned long address; + unsigned level; + pte_t *ptep = NULL; + + pfn = page_to_pfn(page); + if (!PageHighMem(page)) { + address = (unsigned long)__va(pfn << PAGE_SHIFT); + ptep = lookup_address(address, &level); + + if (WARN(ptep == NULL || level != PG_LEVEL_4K, + "m2p_add_override: pfn %lx not mapped", pfn)) + return -EINVAL; + } + + page->private = mfn; + page->index = pfn_to_mfn(pfn); + + __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); + if (!PageHighMem(page)) + /* Just zap old mapping for now */ + pte_clear(&init_mm, address, ptep); + + spin_lock_irqsave(&m2p_override_lock, flags); + list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); + spin_unlock_irqrestore(&m2p_override_lock, flags); + + return 0; +} + +int m2p_remove_override(struct page *page) +{ + unsigned long flags; + unsigned long mfn; + unsigned long pfn; + unsigned long address; + unsigned level; + pte_t *ptep = NULL; + + pfn = page_to_pfn(page); + mfn = get_phys_to_machine(pfn); + if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) + return -EINVAL; + + if (!PageHighMem(page)) { + address = (unsigned long)__va(pfn << PAGE_SHIFT); + ptep = lookup_address(address, &level); + + if (WARN(ptep == NULL || level != PG_LEVEL_4K, + "m2p_remove_override: pfn %lx not mapped", pfn)) + return -EINVAL; + } + + spin_lock_irqsave(&m2p_override_lock, flags); + list_del(&page->lru); + spin_unlock_irqrestore(&m2p_override_lock, flags); + __set_phys_to_machine(pfn, page->index); + + if (!PageHighMem(page)) + set_pte_at(&init_mm, address, ptep, + pfn_pte(pfn, PAGE_KERNEL)); + /* No tlb flush necessary because the caller already + * left the pte unmapped. 
*/ + + return 0; +} + +struct page *m2p_find_override(unsigned long mfn) +{ + unsigned long flags; + struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)]; + struct page *p, *ret; + + ret = NULL; + + spin_lock_irqsave(&m2p_override_lock, flags); + + list_for_each_entry(p, bucket, lru) { + if (p->private == mfn) { + ret = p; + break; + } + } + + spin_unlock_irqrestore(&m2p_override_lock, flags); + + return ret; +} + +unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) +{ + struct page *p = m2p_find_override(mfn); + unsigned long ret = pfn; + + if (p) + ret = page_to_pfn(p); + + return ret; +} +EXPORT_SYMBOL_GPL(m2p_find_override_pfn); diff --git a/arch/xtensa/include/asm/mman.h b/arch/xtensa/include/asm/mman.h index fca4db4..3078901 100644 --- a/arch/xtensa/include/asm/mman.h +++ b/arch/xtensa/include/asm/mman.h @@ -83,6 +83,9 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/drivers/base/node.c b/drivers/base/node.c index ce012a9..36b4305 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -117,12 +117,21 @@ static ssize_t node_read_meminfo(struct sys_device * dev, "Node %d WritebackTmp: %8lu kB\n" "Node %d Slab: %8lu kB\n" "Node %d SReclaimable: %8lu kB\n" - "Node %d SUnreclaim: %8lu kB\n", + "Node %d SUnreclaim: %8lu kB\n" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "Node %d AnonHugePages: %8lu kB\n" +#endif + , nid, K(node_page_state(nid, NR_FILE_DIRTY)), nid, K(node_page_state(nid, NR_WRITEBACK)), nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), - nid, K(node_page_state(nid, NR_ANON_PAGES)), + nid, K(node_page_state(nid, NR_ANON_PAGES) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + + node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR +#endif + ), nid, K(node_page_state(nid, NR_SHMEM)), nid, node_page_state(nid, NR_KERNEL_STACK) * THREAD_SIZE / 1024, @@ -133,7 +142,13 @@ static ssize_t node_read_meminfo(struct sys_device * dev, nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) + node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), - nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); + nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + , nid, + K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) +#endif + ); n += hugetlb_report_node_meminfo(nid, buf + n); return n; } diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index bf1a95e3..98d9ec8 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -240,6 +240,30 @@ config DM_MIRROR Allow volume managers to mirror logical volumes, also needed for live data migration tools such as 'pvmove'. +config DM_RAID + tristate "RAID 4/5/6 target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + select MD_RAID456 + select BLK_DEV_MD + ---help--- + A dm target that supports RAID4, RAID5 and RAID6 mappings + + A RAID-5 set of N drives with a capacity of C MB per drive provides + the capacity of C * (N - 1) MB, and protects against a failure + of a single drive. For a given sector (row) number, (N - 1) drives + contain data sectors, and one drive contains the parity protection. 
+ For a RAID-4 set, the parity blocks are present on a single drive, + while a RAID-5 set distributes the parity across the drives in one + of the available parity distribution methods. + + A RAID-6 set of N drives with a capacity of C MB per drive + provides the capacity of C * (N - 2) MB, and protects + against a failure of any two drives. For a given sector + (row) number, (N - 2) drives contain data sectors, and two + drives contain two independent redundancy syndromes. Like + RAID-5, RAID-6 distributes the syndromes across the drives + in one of the available parity distribution methods. + config DM_LOG_USERSPACE tristate "Mirror userspace logging (EXPERIMENTAL)" depends on DM_MIRROR && EXPERIMENTAL && NET diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 5e3aac4..d013860 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o +obj-$(CONFIG_DM_RAID) += dm-raid.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 5a1ffe3..9a35320 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, || test_bit(Faulty, &rdev->flags)) continue; - target = rdev->sb_start + offset + index * (PAGE_SIZE/512); + target = offset + index * (PAGE_SIZE/512); if (sync_page_io(rdev, target, roundup(size, bdev_logical_block_size(rdev->bdev)), - page, READ)) { + page, READ, true)) { page->index = index; attach_page_buffers(page, NULL); /* so that free_buffer will * quietly no-op */ @@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { mdk_rdev_t *rdev = NULL; + struct block_device *bdev; mddev_t *mddev = bitmap->mddev; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; loff_t offset = mddev->bitmap_info.offset; + + bdev = (rdev->meta_bdev) ? 
rdev->meta_bdev : rdev->bdev; + if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, - bdev_logical_block_size(rdev->bdev)); + bdev_logical_block_size(bdev)); /* Just make sure we aren't corrupting data or * metadata */ @@ -1542,7 +1546,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); - bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; + bitmap->mddev->curr_resync_completed = sector; set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); s = 0; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d5b0e4c..4e054bd 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -18,10 +18,14 @@ #include <linux/crypto.h> #include <linux/workqueue.h> #include <linux/backing-dev.h> +#include <linux/percpu.h> #include <asm/atomic.h> #include <linux/scatterlist.h> #include <asm/page.h> #include <asm/unaligned.h> +#include <crypto/hash.h> +#include <crypto/md5.h> +#include <crypto/algapi.h> #include <linux/device-mapper.h> @@ -63,6 +67,7 @@ struct dm_crypt_request { struct convert_context *ctx; struct scatterlist sg_in; struct scatterlist sg_out; + sector_t iv_sector; }; struct crypt_config; @@ -73,11 +78,13 @@ struct crypt_iv_operations { void (*dtr)(struct crypt_config *cc); int (*init)(struct crypt_config *cc); int (*wipe)(struct crypt_config *cc); - int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); + int (*generator)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); + int (*post)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); }; struct iv_essiv_private { - struct crypto_cipher *tfm; struct crypto_hash *hash_tfm; u8 *salt; }; @@ -86,11 +93,32 @@ struct iv_benbi_private { int shift; }; +#define LMK_SEED_SIZE 64 /* hash + 0 */ +struct iv_lmk_private { + struct crypto_shash *hash_tfm; + u8 *seed; +}; + /* * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. */ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; + +/* + * Duplicated per-CPU state for cipher. + */ +struct crypt_cpu { + struct ablkcipher_request *req; + /* ESSIV: struct crypto_cipher *essiv_tfm */ + void *iv_private; + struct crypto_ablkcipher *tfms[0]; +}; + +/* + * The fields in here must be read only after initialization, + * changing state should be in crypt_cpu. + */ struct crypt_config { struct dm_dev *dev; sector_t start; @@ -108,17 +136,25 @@ struct crypt_config { struct workqueue_struct *crypt_queue; char *cipher; - char *cipher_mode; + char *cipher_string; struct crypt_iv_operations *iv_gen_ops; union { struct iv_essiv_private essiv; struct iv_benbi_private benbi; + struct iv_lmk_private lmk; } iv_gen_private; sector_t iv_offset; unsigned int iv_size; /* + * Duplicated per cpu state. Access through + * per_cpu_ptr() only. + */ + struct crypt_cpu __percpu *cpu; + unsigned tfms_count; + + /* * Layout of each crypto request: * * struct ablkcipher_request @@ -132,11 +168,10 @@ struct crypt_config { * correctly aligned. 
*/ unsigned int dmreq_start; - struct ablkcipher_request *req; - struct crypto_ablkcipher *tfm; unsigned long flags; unsigned int key_size; + unsigned int key_parts; u8 key[0]; }; @@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool; static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); +static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); + +static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) +{ + return this_cpu_ptr(cc->cpu); +} + +/* + * Use this to access cipher attributes that are the same for each CPU. + */ +static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) +{ + return __this_cpu_ptr(cc->cpu)->tfms[0]; +} /* * Different IV generation algorithms: @@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); * null: the initial vector is always zero. Provides compatibility with * obsolete loop_fish2 devices. Do not use for new devices. * + * lmk: Compatible implementation of the block chaining mode used + * by the Loop-AES block device encryption system + * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from the sector number, the data and + * optionally extra IV seed. + * This means that after decryption the first block + * of sector must be tweaked according to decrypted data. + * Loop-AES can use three encryption schemes: + * version 1: is plain aes-cbc mode + * version 2: uses 64 multikey scheme with lmk IV generator + * version 3: the same as version 2 with additional IV seed + * (it uses 65 keys, last key is used as IV seed) + * * plumb: unimplemented, see: * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ -static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); + *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); return 0; } static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, - sector_t sector) + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(sector); + *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); return 0; } @@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; struct hash_desc desc; struct scatterlist sg; - int err; + struct crypto_cipher *essiv_tfm; + int err, cpu; sg_init_one(&sg, cc->key, cc->key_size); desc.tfm = essiv->hash_tfm; @@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) if (err) return err; - return crypto_cipher_setkey(essiv->tfm, essiv->salt, + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private, + + err = crypto_cipher_setkey(essiv_tfm, essiv->salt, crypto_hash_digestsize(essiv->hash_tfm)); + if (err) + return err; + } + + return 0; } /* Wipe salt and reset key derived from volume key */ @@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); + struct crypto_cipher *essiv_tfm; + int cpu, r, err = 0; memset(essiv->salt, 0, salt_size); - return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, 
cpu)->iv_private; + r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); + if (r) + err = r; + } + + return err; +} + +/* Set up per cpu cipher state */ +static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, + struct dm_target *ti, + u8 *salt, unsigned saltsize) +{ + struct crypto_cipher *essiv_tfm; + int err; + + /* Setup the essiv_tfm with the given salt */ + essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(essiv_tfm)) { + ti->error = "Error allocating crypto tfm for ESSIV"; + return essiv_tfm; + } + + if (crypto_cipher_blocksize(essiv_tfm) != + crypto_ablkcipher_ivsize(any_tfm(cc))) { + ti->error = "Block size of ESSIV cipher does " + "not match IV size of block cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(-EINVAL); + } + + err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); + if (err) { + ti->error = "Failed to set key for ESSIV cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(err); + } + + return essiv_tfm; } static void crypt_iv_essiv_dtr(struct crypt_config *cc) { + int cpu; + struct crypt_cpu *cpu_cc; + struct crypto_cipher *essiv_tfm; struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - crypto_free_cipher(essiv->tfm); - essiv->tfm = NULL; - crypto_free_hash(essiv->hash_tfm); essiv->hash_tfm = NULL; kzfree(essiv->salt); essiv->salt = NULL; + + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + essiv_tfm = cpu_cc->iv_private; + + if (essiv_tfm) + crypto_free_cipher(essiv_tfm); + + cpu_cc->iv_private = NULL; + } } static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, struct crypto_cipher *essiv_tfm = NULL; struct crypto_hash *hash_tfm = NULL; u8 *salt = NULL; - int err; + int err, cpu; if (!opts) { ti->error = "Digest algorithm missing for ESSIV mode"; @@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, goto bad; } - /* Allocate essiv_tfm */ - essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(essiv_tfm)) { - ti->error = "Error allocating crypto tfm for ESSIV"; - err = PTR_ERR(essiv_tfm); - goto bad; - } - if (crypto_cipher_blocksize(essiv_tfm) != - crypto_ablkcipher_ivsize(cc->tfm)) { - ti->error = "Block size of ESSIV cipher does " - "not match IV size of block cipher"; - err = -EINVAL; - goto bad; - } - cc->iv_gen_private.essiv.salt = salt; - cc->iv_gen_private.essiv.tfm = essiv_tfm; cc->iv_gen_private.essiv.hash_tfm = hash_tfm; + for_each_possible_cpu(cpu) { + essiv_tfm = setup_essiv_cpu(cc, ti, salt, + crypto_hash_digestsize(hash_tfm)); + if (IS_ERR(essiv_tfm)) { + crypt_iv_essiv_dtr(cc); + return PTR_ERR(essiv_tfm); + } + per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm; + } + return 0; bad: - if (essiv_tfm && !IS_ERR(essiv_tfm)) - crypto_free_cipher(essiv_tfm); if (hash_tfm && !IS_ERR(hash_tfm)) crypto_free_hash(hash_tfm); kfree(salt); return err; } -static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { + struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; + memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(sector); - crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); + *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); + crypto_cipher_encrypt_one(essiv_tfm, iv, iv); + return 0; } static int crypt_iv_benbi_ctr(struct 
crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); + unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); int log = ilog2(bs); /* we need to calculate how far we must shift the sector count @@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc) { } -static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { __be64 val; memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ - val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); + val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); return 0; } -static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); return 0; } +static void crypt_iv_lmk_dtr(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) + crypto_free_shash(lmk->hash_tfm); + lmk->hash_tfm = NULL; + + kzfree(lmk->seed); + lmk->seed = NULL; +} + +static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(lmk->hash_tfm)) { + ti->error = "Error initializing LMK hash"; + return PTR_ERR(lmk->hash_tfm); + } + + /* No seed in LMK version 2 */ + if (cc->key_parts == cc->tfms_count) { + lmk->seed = NULL; + return 0; + } + + lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); + if (!lmk->seed) { + crypt_iv_lmk_dtr(cc); + ti->error = "Error kmallocing seed storage in LMK"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_lmk_init(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + int subkey_size = cc->key_size / cc->key_parts; + + /* LMK seed is on the position of LMK_KEYS + 1 key */ + if (lmk->seed) + memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), + crypto_shash_digestsize(lmk->hash_tfm)); + + return 0; +} + +static int crypt_iv_lmk_wipe(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->seed) + memset(lmk->seed, 0, LMK_SEED_SIZE); + + return 0; +} + +static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + struct { + struct shash_desc desc; + char ctx[crypto_shash_descsize(lmk->hash_tfm)]; + } sdesc; + struct md5_state md5state; + u32 buf[4]; + int i, r; + + sdesc.desc.tfm = lmk->hash_tfm; + sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(&sdesc.desc); + if (r) + return r; + + if (lmk->seed) { + r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); + if (r) + return r; + } + + /* Sector is always 512B, block size 16, add data of blocks 1-31 */ + r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); + if (r) + return r; + + /* Sector is cropped to 56 bits here */ + buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); + buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); + buf[2] = cpu_to_le32(4024); + buf[3] = 0; + r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); + if (r) + return r; + + /* No 
MD5 padding here */ + r = crypto_shash_export(&sdesc.desc, &md5state); + if (r) + return r; + + for (i = 0; i < MD5_HASH_WORDS; i++) + __cpu_to_le32s(&md5state.hash[i]); + memcpy(iv, &md5state.hash, cc->iv_size); + + return 0; +} + +static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *src; + int r = 0; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); + r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); + kunmap_atomic(src, KM_USER0); + } else + memset(iv, 0, cc->iv_size); + + return r; +} + +static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) + return 0; + + dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); + r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); + + /* Tweak the first block of plaintext sector */ + if (!r) + crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); + + kunmap_atomic(dst, KM_USER0); + return r; +} + static struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = { .generator = crypt_iv_null_gen }; +static struct crypt_iv_operations crypt_iv_lmk_ops = { + .ctr = crypt_iv_lmk_ctr, + .dtr = crypt_iv_lmk_dtr, + .init = crypt_iv_lmk_init, + .wipe = crypt_iv_lmk_wipe, + .generator = crypt_iv_lmk_gen, + .post = crypt_iv_lmk_post +}; + static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, @@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); } +static u8 *iv_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); +} + static int crypt_convert_block(struct crypt_config *cc, struct convert_context *ctx, struct ablkcipher_request *req) @@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc, int r = 0; dmreq = dmreq_of_req(cc, req); - iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_ablkcipher_alignmask(cc->tfm) + 1); + iv = iv_of_dmreq(cc, dmreq); + dmreq->iv_sector = ctx->sector; dmreq->ctx = ctx; sg_init_table(&dmreq->sg_in, 1); sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, @@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc, } if (cc->iv_gen_ops) { - r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); + r = cc->iv_gen_ops->generator(cc, iv, dmreq); if (r < 0) return r; } @@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc, else r = crypto_ablkcipher_decrypt(req); + if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) + r = cc->iv_gen_ops->post(cc, iv, dmreq); + return r; } static void kcryptd_async_done(struct crypto_async_request *async_req, int error); + static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { - if (!cc->req) - cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(cc->req, cc->tfm); - ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | - CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, - dmreq_of_req(cc, cc->req)); + struct crypt_cpu *this_cc = this_crypt_config(cc); + unsigned key_index = ctx->sector & (cc->tfms_count - 1); + + if (!this_cc->req) + 
this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); + + ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); + ablkcipher_request_set_callback(this_cc->req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); } /* @@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc, static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { + struct crypt_cpu *this_cc = this_crypt_config(cc); int r; atomic_set(&ctx->pending, 1); @@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc, atomic_inc(&ctx->pending); - r = crypt_convert_block(cc, ctx, cc->req); + r = crypt_convert_block(cc, ctx, this_cc->req); switch (r) { /* async */ @@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc, INIT_COMPLETION(ctx->restart); /* fall through*/ case -EINPROGRESS: - cc->req = NULL; + this_cc->req = NULL; ctx->sector++; continue; @@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io) * They must be separated as otherwise the final stages could be * starved by new requests which can block in the first stages due * to memory allocation. + * + * The work is done per CPU global for all dm-crypt instances. + * They should not depend on each other and do not block. */ static void crypt_endio(struct bio *clone, int error) { @@ -691,26 +991,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_destructor = dm_crypt_bio_destructor; } -static void kcryptd_io_read(struct dm_crypt_io *io) +static void kcryptd_unplug(struct crypt_config *cc) +{ + blk_unplug(bdev_get_queue(cc->dev->bdev)); +} + +static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) { struct crypt_config *cc = io->target->private; struct bio *base_bio = io->base_bio; struct bio *clone; - crypt_inc_pending(io); - /* * The block layer might modify the bvec array, so always * copy the required bvecs because we need the original * one in order to decrypt the whole bio data *afterwards*. 
*/ - clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); - if (unlikely(!clone)) { - io->error = -ENOMEM; - crypt_dec_pending(io); - return; + clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); + if (!clone) { + kcryptd_unplug(cc); + return 1; } + crypt_inc_pending(io); + clone_init(io, clone); clone->bi_idx = 0; clone->bi_vcnt = bio_segments(base_bio); @@ -720,6 +1024,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io) sizeof(struct bio_vec) * clone->bi_vcnt); generic_make_request(clone); + return 0; } static void kcryptd_io_write(struct dm_crypt_io *io) @@ -732,9 +1037,12 @@ static void kcryptd_io(struct work_struct *work) { struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); - if (bio_data_dir(io->base_bio) == READ) - kcryptd_io_read(io); - else + if (bio_data_dir(io->base_bio) == READ) { + crypt_inc_pending(io); + if (kcryptd_io_read(io, GFP_NOIO)) + io->error = -ENOMEM; + crypt_dec_pending(io); + } else kcryptd_io_write(io); } @@ -901,6 +1209,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, return; } + if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) + error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); + mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); if (!atomic_dec_and_test(&ctx->pending)) @@ -971,34 +1282,84 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) } } -static int crypt_set_key(struct crypt_config *cc, char *key) +static void crypt_free_tfms(struct crypt_config *cc, int cpu) { - unsigned key_size = strlen(key) >> 1; + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; - if (cc->key_size && cc->key_size != key_size) + for (i = 0; i < cc->tfms_count; i++) + if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { + crypto_free_ablkcipher(cpu_cc->tfms[i]); + cpu_cc->tfms[i] = NULL; + } +} + +static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) +{ + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; + int err; + + for (i = 0; i < cc->tfms_count; i++) { + cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); + if (IS_ERR(cpu_cc->tfms[i])) { + err = PTR_ERR(cpu_cc->tfms[i]); + crypt_free_tfms(cc, cpu); + return err; + } + } + + return 0; +} + +static int crypt_setkey_allcpus(struct crypt_config *cc) +{ + unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); + int cpu, err = 0, i, r; + + for_each_possible_cpu(cpu) { + for (i = 0; i < cc->tfms_count; i++) { + r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], + cc->key + (i * subkey_size), subkey_size); + if (r) + err = r; + } + } + + return err; +} + +static int crypt_set_key(struct crypt_config *cc, char *key) +{ + /* The key size may not be changed. */ + if (cc->key_size != (strlen(key) >> 1)) return -EINVAL; - cc->key_size = key_size; /* initial settings */ + /* Hyphen (which gives a key_size of zero) means there is no key. 
*/ + if (!cc->key_size && strcmp(key, "-")) + return -EINVAL; - if ((!key_size && strcmp(key, "-")) || - (key_size && crypt_decode_key(cc->key, key, key_size) < 0)) + if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) return -EINVAL; set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); + return crypt_setkey_allcpus(cc); } static int crypt_wipe_key(struct crypt_config *cc) { clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); memset(&cc->key, 0, cc->key_size * sizeof(u8)); - return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); + + return crypt_setkey_allcpus(cc); } static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = ti->private; + struct crypt_cpu *cpu_cc; + int cpu; ti->private = NULL; @@ -1010,6 +1371,14 @@ static void crypt_dtr(struct dm_target *ti) if (cc->crypt_queue) destroy_workqueue(cc->crypt_queue); + if (cc->cpu) + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + if (cpu_cc->req) + mempool_free(cpu_cc->req, cc->req_pool); + crypt_free_tfms(cc, cpu); + } + if (cc->bs) bioset_free(cc->bs); @@ -1023,14 +1392,14 @@ static void crypt_dtr(struct dm_target *ti) if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); - if (cc->tfm && !IS_ERR(cc->tfm)) - crypto_free_ablkcipher(cc->tfm); - if (cc->dev) dm_put_device(ti, cc->dev); + if (cc->cpu) + free_percpu(cc->cpu); + kzfree(cc->cipher); - kzfree(cc->cipher_mode); + kzfree(cc->cipher_string); /* Must zero key material before freeing */ kzfree(cc); @@ -1040,9 +1409,9 @@ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) { struct crypt_config *cc = ti->private; - char *tmp, *cipher, *chainmode, *ivmode, *ivopts; + char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; char *cipher_api = NULL; - int ret = -EINVAL; + int cpu, ret = -EINVAL; /* Convert to crypto api definition? */ if (strchr(cipher_in, '(')) { @@ -1050,23 +1419,31 @@ static int crypt_ctr_cipher(struct dm_target *ti, return -EINVAL; } + cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); + if (!cc->cipher_string) + goto bad_mem; + /* * Legacy dm-crypt cipher specification - * cipher-mode-iv:ivopts + * cipher[:keycount]-mode-iv:ivopts */ tmp = cipher_in; - cipher = strsep(&tmp, "-"); + keycount = strsep(&tmp, "-"); + cipher = strsep(&keycount, ":"); + + if (!keycount) + cc->tfms_count = 1; + else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || + !is_power_of_2(cc->tfms_count)) { + ti->error = "Bad cipher key count specification"; + return -EINVAL; + } + cc->key_parts = cc->tfms_count; cc->cipher = kstrdup(cipher, GFP_KERNEL); if (!cc->cipher) goto bad_mem; - if (tmp) { - cc->cipher_mode = kstrdup(tmp, GFP_KERNEL); - if (!cc->cipher_mode) - goto bad_mem; - } - chainmode = strsep(&tmp, "-"); ivopts = strsep(&tmp, "-"); ivmode = strsep(&ivopts, ":"); @@ -1074,10 +1451,19 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (tmp) DMWARN("Ignoring unexpected additional cipher options"); - /* Compatibility mode for old dm-crypt mappings */ + cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) + + cc->tfms_count * sizeof(*(cc->cpu->tfms)), + __alignof__(struct crypt_cpu)); + if (!cc->cpu) { + ti->error = "Cannot allocate per cpu state"; + goto bad_mem; + } + + /* + * For compatibility with the original dm-crypt mapping format, if + * only the cipher name is supplied, use cbc-plain. 
+ */ if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { - kfree(cc->cipher_mode); - cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL); chainmode = "cbc"; ivmode = "plain"; } @@ -1099,11 +1485,12 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Allocate cipher */ - cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); - if (IS_ERR(cc->tfm)) { - ret = PTR_ERR(cc->tfm); - ti->error = "Error allocating crypto tfm"; - goto bad; + for_each_possible_cpu(cpu) { + ret = crypt_alloc_tfms(cc, cpu, cipher_api); + if (ret < 0) { + ti->error = "Error allocating crypto tfm"; + goto bad; + } } /* Initialize and set key */ @@ -1114,7 +1501,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Initialize IV */ - cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); + cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -1137,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti, cc->iv_gen_ops = &crypt_iv_benbi_ops; else if (strcmp(ivmode, "null") == 0) cc->iv_gen_ops = &crypt_iv_null_ops; - else { + else if (strcmp(ivmode, "lmk") == 0) { + cc->iv_gen_ops = &crypt_iv_lmk_ops; + /* Version 2 and 3 is recognised according + * to length of provided multi-key string. + * If present (version 3), last key is used as IV seed. + */ + if (cc->key_size % cc->key_parts) + cc->key_parts++; + } else { ret = -EINVAL; ti->error = "Invalid IV mode"; goto bad; @@ -1194,6 +1589,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot allocate encryption context"; return -ENOMEM; } + cc->key_size = key_size; ti->private = cc; ret = crypt_ctr_cipher(ti, argv[0], argv[1]); @@ -1208,9 +1604,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); + cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & + cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & ~(crypto_tfm_ctx_alignment() - 1); cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + @@ -1219,7 +1615,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot allocate crypt request mempool"; goto bad; } - cc->req = NULL; cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { @@ -1252,13 +1647,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->start = tmpll; ret = -ENOMEM; - cc->io_queue = create_singlethread_workqueue("kcryptd_io"); + cc->io_queue = alloc_workqueue("kcryptd_io", + WQ_NON_REENTRANT| + WQ_MEM_RECLAIM, + 1); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } - cc->crypt_queue = create_singlethread_workqueue("kcryptd"); + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_NON_REENTRANT| + WQ_CPU_INTENSIVE| + WQ_MEM_RECLAIM, + 1); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; goto bad; @@ -1286,9 +1688,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); - if (bio_data_dir(io->base_bio) == READ) - kcryptd_queue_io(io); - else + if (bio_data_dir(io->base_bio) == READ) { + if (kcryptd_io_read(io, GFP_NOWAIT)) + kcryptd_queue_io(io); + } else 
kcryptd_queue_crypt(io); return DM_MAPIO_SUBMITTED; @@ -1306,10 +1709,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - if (cc->cipher_mode) - DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode); - else - DMEMIT("%s ", cc->cipher); + DMEMIT("%s ", cc->cipher_string); if (cc->key_size > 0) { if ((maxlen - sz) < ((cc->key_size << 1) + 1)) @@ -1421,7 +1821,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 7, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index baa1191..f18375d 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -352,7 +352,7 @@ static int __init dm_delay_init(void) { int r = -ENOMEM; - kdelayd_wq = create_workqueue("kdelayd"); + kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!kdelayd_wq) { DMERR("Couldn't start kdelayd"); goto bad_queue; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4b54618..6d12775 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -295,19 +295,55 @@ retry: DMWARN("remove_all left %d open device(s)", dev_skipped); } +/* + * Set the uuid of a hash_cell that isn't already set. + */ +static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) +{ + mutex_lock(&dm_hash_cells_mutex); + hc->uuid = new_uuid; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); +} + +/* + * Changes the name of a hash_cell and returns the old name for + * the caller to free. + */ +static char *__change_cell_name(struct hash_cell *hc, char *new_name) +{ + char *old_name; + + /* + * Rename and move the name cell. + */ + list_del(&hc->name_list); + old_name = hc->name; + + mutex_lock(&dm_hash_cells_mutex); + hc->name = new_name; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + return old_name; +} + static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, const char *new) { - char *new_name, *old_name; + char *new_data, *old_name = NULL; struct hash_cell *hc; struct dm_table *table; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; /* * duplicate new. */ - new_name = kstrdup(new, GFP_KERNEL); - if (!new_name) + new_data = kstrdup(new, GFP_KERNEL); + if (!new_data) return ERR_PTR(-ENOMEM); down_write(&_hash_lock); @@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, /* * Is new free ? */ - hc = __get_name_cell(new); + if (change_uuid) + hc = __get_uuid_cell(new); + else + hc = __get_name_cell(new); + if (hc) { - DMWARN("asked to rename to an already-existing name %s -> %s", + DMWARN("Unable to change %s on mapped device %s to one that " + "already exists: %s", + change_uuid ? "uuid" : "name", param->name, new); dm_put(hc->md); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-EBUSY); } @@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, */ hc = __get_name_cell(param->name); if (!hc) { - DMWARN("asked to rename a non-existent device %s -> %s", - param->name, new); + DMWARN("Unable to rename non-existent device, %s to %s%s", + param->name, change_uuid ? "uuid " : "", new); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-ENXIO); } /* - * rename and move the name cell. 
+ * Does this device already have a uuid? */ - list_del(&hc->name_list); - old_name = hc->name; - mutex_lock(&dm_hash_cells_mutex); - hc->name = new_name; - mutex_unlock(&dm_hash_cells_mutex); - list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + if (change_uuid && hc->uuid) { + DMWARN("Unable to change uuid of mapped device %s to %s " + "because uuid is already set to %s", + param->name, new, hc->uuid); + dm_put(hc->md); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-EINVAL); + } + + if (change_uuid) + __set_cell_uuid(hc, new_data); + else + old_name = __change_cell_name(hc, new_data); /* * Wake up any dm event waiters. @@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) */ r = dm_lock_for_deletion(md); if (r) { - DMWARN("unable to remove open device %s", hc->name); + DMDEBUG_LIMIT("unable to remove open device %s", hc->name); up_write(&_hash_lock); dm_put(md); return r; @@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end) static int dev_rename(struct dm_ioctl *param, size_t param_size) { int r; - char *new_name = (char *) param + param->data_start; + char *new_data = (char *) param + param->data_start; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; - if (new_name < param->data || - invalid_str(new_name, (void *) param + param_size) || - strlen(new_name) > DM_NAME_LEN - 1) { - DMWARN("Invalid new logical volume name supplied."); + if (new_data < param->data || + invalid_str(new_data, (void *) param + param_size) || + strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { + DMWARN("Invalid new mapped device name or uuid string supplied."); return -EINVAL; } - r = check_name(new_name); - if (r) - return r; + if (!change_uuid) { + r = check_name(new_data); + if (r) + return r; + } - md = dm_hash_rename(param, new_name); + md = dm_hash_rename(param, new_data); if (IS_ERR(md)) return PTR_ERR(md); @@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index d8587ba..924f5f0 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -37,6 +37,13 @@ struct dm_kcopyd_client { unsigned int nr_pages; unsigned int nr_free_pages; + /* + * Block devices to unplug. + * Non-NULL pointer means that a block device has some pending requests + * and needs to be unplugged. + */ + struct block_device *unplug[2]; + struct dm_io_client *io_client; wait_queue_head_t destroyq; @@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job) return 0; } +/* + * Unplug the block device at the specified index. 
+ */ +static void unplug(struct dm_kcopyd_client *kc, int rw) +{ + if (kc->unplug[rw] != NULL) { + blk_unplug(bdev_get_queue(kc->unplug[rw])); + kc->unplug[rw] = NULL; + } +} + +/* + * Prepare block device unplug. If there's another device + * to be unplugged at the same array index, we unplug that + * device first. + */ +static void prepare_unplug(struct dm_kcopyd_client *kc, int rw, + struct block_device *bdev) +{ + if (likely(kc->unplug[rw] == bdev)) + return; + unplug(kc, rw); + kc->unplug[rw] = bdev; +} + static void complete_io(unsigned long error, void *context) { struct kcopyd_job *job = (struct kcopyd_job *) context; @@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, + .bi_rw = job->rw, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, @@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job) .client = job->kc->io_client, }; - if (job->rw == READ) + if (job->rw == READ) { r = dm_io(&io_req, 1, &job->source, NULL); - else + prepare_unplug(job->kc, READ, job->source.bdev); + } else { + if (job->num_dests > 1) + io_req.bi_rw |= REQ_UNPLUG; r = dm_io(&io_req, job->num_dests, job->dests, NULL); + if (!(io_req.bi_rw & REQ_UNPLUG)) + prepare_unplug(job->kc, WRITE, job->dests[0].bdev); + } return r; } @@ -435,10 +473,18 @@ static void do_work(struct work_struct *work) * Pages jobs when successful will jump onto the io jobs * list. io jobs call wake when they complete and it all * starts again. + * + * Note that io_jobs add block devices to the unplug array, + * this array is cleared with "unplug" calls. It is thus + * forbidden to run complete_jobs after io_jobs and before + * unplug because the block device could be destroyed in + * job completion callback. */ process_jobs(&kc->complete_jobs, kc, run_complete_job); process_jobs(&kc->pages_jobs, kc, run_pages_job); process_jobs(&kc->io_jobs, kc, run_io_job); + unplug(kc, READ); + unplug(kc, WRITE); } /* @@ -619,12 +665,15 @@ int dm_kcopyd_client_create(unsigned int nr_pages, INIT_LIST_HEAD(&kc->io_jobs); INIT_LIST_HEAD(&kc->pages_jobs); + memset(kc->unplug, 0, sizeof(kc->unplug)); + kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); if (!kc->job_pool) goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); + kc->kcopyd_wq = alloc_workqueue("kcopyd", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!kc->kcopyd_wq) goto bad_workqueue; diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 1ed0094..aa2e0c3 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -12,12 +12,22 @@ #include "dm-log-userspace-transfer.h" +#define DM_LOG_USERSPACE_VSN "1.1.0" + struct flush_entry { int type; region_t region; struct list_head list; }; +/* + * This limit on the number of mark and clear request is, to a degree, + * arbitrary. However, there is some basis for the choice in the limits + * imposed on the size of data payload by dm-log-userspace-transfer.c: + * dm_consult_userspace(). + */ +#define MAX_FLUSH_GROUP_COUNT 32 + struct log_c { struct dm_target *ti; uint32_t region_size; @@ -37,8 +47,15 @@ struct log_c { */ uint64_t in_sync_hint; + /* + * Mark and clear requests are held until a flush is issued + * so that we can group, and thereby limit, the amount of + * network traffic between kernel and userspace. The 'flush_lock' + * is used to protect these lists. 
+ */ spinlock_t flush_lock; - struct list_head flush_list; /* only for clear and mark requests */ + struct list_head mark_list; + struct list_head clear_list; }; static mempool_t *flush_entry_pool; @@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, strncpy(lc->uuid, argv[0], DM_UUID_LEN); spin_lock_init(&lc->flush_lock); - INIT_LIST_HEAD(&lc->flush_list); + INIT_LIST_HEAD(&lc->mark_list); + INIT_LIST_HEAD(&lc->clear_list); str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); if (str_size < 0) { @@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, ctr_str, str_size, NULL, NULL); - if (r == -ESRCH) { - DMERR("Userspace log server not found"); + if (r < 0) { + if (r == -ESRCH) + DMERR("Userspace log server not found"); + else + DMERR("Userspace log server failed to create log"); goto out; } @@ -214,10 +235,9 @@ out: static void userspace_dtr(struct dm_dirty_log *log) { - int r; struct log_c *lc = log->context; - r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, + (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, NULL, 0, NULL, NULL); @@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region, return (r) ? 0 : (int)in_sync; } +static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + struct flush_entry *fe; + + list_for_each_entry(fe, flush_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + break; + } + + return r; +} + +static int flush_by_group(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + int count; + uint32_t type = 0; + struct flush_entry *fe, *tmp_fe; + LIST_HEAD(tmp_list); + uint64_t group[MAX_FLUSH_GROUP_COUNT]; + + /* + * Group process the requests + */ + while (!list_empty(flush_list)) { + count = 0; + + list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { + group[count] = fe->region; + count++; + + list_del(&fe->list); + list_add(&fe->list, &tmp_list); + + type = fe->type; + if (count >= MAX_FLUSH_GROUP_COUNT) + break; + } + + r = userspace_do_request(lc, lc->uuid, type, + (char *)(group), + count * sizeof(uint64_t), + NULL, NULL); + if (r) { + /* Group send failed. Attempt one-by-one. */ + list_splice_init(&tmp_list, flush_list); + r = flush_one_by_one(lc, flush_list); + break; + } + } + + /* + * Must collect flush_entrys that were successfully processed + * as a group so that they will be free'd by the caller. + */ + list_splice_init(&tmp_list, flush_list); + + return r; +} + /* * userspace_flush * @@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log) int r = 0; unsigned long flags; struct log_c *lc = log->context; - LIST_HEAD(flush_list); + LIST_HEAD(mark_list); + LIST_HEAD(clear_list); struct flush_entry *fe, *tmp_fe; spin_lock_irqsave(&lc->flush_lock, flags); - list_splice_init(&lc->flush_list, &flush_list); + list_splice_init(&lc->mark_list, &mark_list); + list_splice_init(&lc->clear_list, &clear_list); spin_unlock_irqrestore(&lc->flush_lock, flags); - if (list_empty(&flush_list)) + if (list_empty(&mark_list) && list_empty(&clear_list)) return 0; - /* - * FIXME: Count up requests, group request types, - * allocate memory to stick all requests in and - * send to server in one go. Failing the allocation, - * do it one by one. 
- */ + r = flush_by_group(lc, &mark_list); + if (r) + goto fail; - list_for_each_entry(fe, &flush_list, list) { - r = userspace_do_request(lc, lc->uuid, fe->type, - (char *)&fe->region, - sizeof(fe->region), - NULL, NULL); - if (r) - goto fail; - } + r = flush_by_group(lc, &clear_list); + if (r) + goto fail; r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL); @@ -395,7 +474,11 @@ fail: * Calling code will receive an error and will know that * the log facility has failed. */ - list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { + list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { list_del(&fe->list); mempool_free(fe, flush_entry_pool); } @@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region) spin_lock_irqsave(&lc->flush_lock, flags); fe->type = DM_ULOG_MARK_REGION; fe->region = region; - list_add(&fe->list, &lc->flush_list); + list_add(&fe->list, &lc->mark_list); spin_unlock_irqrestore(&lc->flush_lock, flags); return; @@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region) spin_lock_irqsave(&lc->flush_lock, flags); fe->type = DM_ULOG_CLEAR_REGION; fe->region = region; - list_add(&fe->list, &lc->flush_list); + list_add(&fe->list, &lc->clear_list); spin_unlock_irqrestore(&lc->flush_lock, flags); return; @@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void) return r; } - DMINFO("version 1.0.0 loaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); return 0; } @@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void) dm_ulog_tfr_exit(); mempool_destroy(flush_entry_pool); - DMINFO("version 1.0.0 unloaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); return; } diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 075cbcf..049eaf1 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -198,6 +198,7 @@ resend: memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->version = DM_ULOG_REQUEST_VERSION; tfr->luid = luid; tfr->seq = dm_ulog_seq++; diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 33420e6..6951536 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, r = PTR_ERR(lc->io_req.client); DMWARN("couldn't allocate disk io client"); kfree(lc); - return -ENOMEM; + return r; } lc->disk_header = vmalloc(buf_size); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 487ecda..b82d288 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -23,6 +23,8 @@ #define DM_MSG_PREFIX "multipath" #define MESG_STR(x) x, sizeof(x) +#define DM_PG_INIT_DELAY_MSECS 2000 +#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) /* Path properties */ struct pgpath { @@ -33,8 +35,7 @@ struct pgpath { unsigned fail_count; /* Cumulative failure count */ struct dm_path path; - struct work_struct deactivate_path; - struct work_struct activate_path; + struct delayed_work activate_path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -65,11 +66,15 @@ struct multipath { const char *hw_handler_name; char *hw_handler_params; + unsigned nr_priority_groups; struct list_head priority_groups; + + wait_queue_head_t pg_init_wait; /* Wait for pg_init 
completion */ + unsigned pg_init_required; /* pg_init needs calling? */ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ - wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + unsigned pg_init_delay_retry; /* Delay pg_init retry? */ unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; @@ -82,6 +87,7 @@ struct multipath { unsigned saved_queue_if_no_path;/* Saved state during suspension */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ + unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ struct work_struct process_queued_ios; struct list_head queued_ios; @@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void process_queued_ios(struct work_struct *work); static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); -static void deactivate_path(struct work_struct *work); /*----------------------------------------------- @@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = 1; - INIT_WORK(&pgpath->deactivate_path, deactivate_path); - INIT_WORK(&pgpath->activate_path, activate_path); + INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); } return pgpath; @@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath) kfree(pgpath); } -static void deactivate_path(struct work_struct *work) -{ - struct pgpath *pgpath = - container_of(work, struct pgpath, deactivate_path); - - blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue); -} - static struct priority_group *alloc_priority_group(void) { struct priority_group *pg; @@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) INIT_LIST_HEAD(&m->queued_ios); spin_lock_init(&m->lock); m->queue_io = 1; + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->process_queued_ios, process_queued_ios); INIT_WORK(&m->trigger_event, trigger_event); init_waitqueue_head(&m->pg_init_wait); @@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m) static void __pg_init_all_paths(struct multipath *m) { struct pgpath *pgpath; + unsigned long pg_init_delay = 0; m->pg_init_count++; m->pg_init_required = 0; + if (m->pg_init_delay_retry) + pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? 
+ m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { /* Skip failed paths */ if (!pgpath->is_active) continue; - if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, + pg_init_delay)) m->pg_init_in_progress++; } } @@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m) const char *param_name; static struct param _params[] = { - {0, 3, "invalid number of feature args"}, + {0, 5, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, + {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, }; r = read_param(_params, shift(as), &argc, &ti->error); @@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m) continue; } + if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && + (argc >= 1)) { + r = read_param(_params + 2, shift(as), + &m->pg_init_delay_msecs, &ti->error); + argc--; + continue; + } + ti->error = "Unrecognised multipath feature request"; r = -EINVAL; } while (argc && !r); @@ -931,7 +942,7 @@ static void flush_multipath_work(struct multipath *m) flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); - flush_scheduled_work(); + flush_work_sync(&m->trigger_event); } static void multipath_dtr(struct dm_target *ti) @@ -995,7 +1006,6 @@ static int fail_path(struct pgpath *pgpath) pgpath->path.dev->name, m->nr_valid_paths); schedule_work(&m->trigger_event); - queue_work(kmultipathd, &pgpath->deactivate_path); out: spin_unlock_irqrestore(&m->lock, flags); @@ -1034,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath) m->current_pgpath = NULL; queue_work(kmultipathd, &m->process_queued_ios); } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { - if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) m->pg_init_in_progress++; } @@ -1169,6 +1179,7 @@ static void pg_init_done(void *data, int errors) struct priority_group *pg = pgpath->pg; struct multipath *m = pg->m; unsigned long flags; + unsigned delay_retry = 0; /* device or driver problems */ switch (errors) { @@ -1193,8 +1204,9 @@ static void pg_init_done(void *data, int errors) */ bypass_pg(m, pg, 1); break; - /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */ case SCSI_DH_RETRY: + /* Wait before retrying. 
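+ * Editor's note, not part of the original patch: the delay itself is
+ * applied in __pg_init_all_paths() and defaults to DM_PG_INIT_DELAY_MSECS
+ * (2000 msecs) unless the table overrides it via the new feature
+ * argument, e.g. a features section of "2 pg_init_delay_msecs 5000"
+ * (the value 5000 is only an illustration).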
*/ + delay_retry = 1; case SCSI_DH_IMM_RETRY: case SCSI_DH_RES_TEMP_UNAVAIL: if (pg_init_limit_reached(m, pgpath)) @@ -1227,6 +1239,7 @@ static void pg_init_done(void *data, int errors) if (!m->pg_init_required) m->queue_io = 0; + m->pg_init_delay_retry = delay_retry; queue_work(kmultipathd, &m->process_queued_ios); /* @@ -1241,7 +1254,7 @@ out: static void activate_path(struct work_struct *work) { struct pgpath *pgpath = - container_of(work, struct pgpath, activate_path); + container_of(work, struct pgpath, activate_path.work); scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), pg_init_done, pgpath); @@ -1382,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); else { DMEMIT("%u ", m->queue_if_no_path + - (m->pg_init_retries > 0) * 2); + (m->pg_init_retries > 0) * 2 + + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); if (m->queue_if_no_path) DMEMIT("queue_if_no_path "); if (m->pg_init_retries) DMEMIT("pg_init_retries %u ", m->pg_init_retries); + if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) + DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); } if (!m->hw_handler_name || type == STATUSTYPE_INFO) @@ -1655,7 +1671,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, @@ -1687,7 +1703,7 @@ static int __init dm_multipath_init(void) return -EINVAL; } - kmultipathd = create_workqueue("kmpathd"); + kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); if (!kmultipathd) { DMERR("failed to create workqueue kmpathd"); dm_unregister_target(&multipath_target); @@ -1701,7 +1717,8 @@ static int __init dm_multipath_init(void) * old workqueue would also create a bottleneck in the * path of the storage hardware device activation. */ - kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); + kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", + WQ_MEM_RECLAIM); if (!kmpath_handlerd) { DMERR("failed to create workqueue kmpath_handlerd"); destroy_workqueue(kmultipathd); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c new file mode 100644 index 0000000..b9e1e15 --- /dev/null +++ b/drivers/md/dm-raid.c @@ -0,0 +1,697 @@ +/* + * Copyright (C) 2010-2011 Neil Brown + * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include <linux/slab.h> + +#include "md.h" +#include "raid5.h" +#include "dm.h" +#include "bitmap.h" + +#define DM_MSG_PREFIX "raid" + +/* + * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then + * make it so the flag doesn't set anything. + */ +#ifndef MD_SYNC_STATE_FORCED +#define MD_SYNC_STATE_FORCED 0 +#endif + +struct raid_dev { + /* + * Two DM devices, one to hold metadata and one to hold the + * actual data/parity. The reason for this is to not confuse + * ti->len and give more flexibility in altering size and + * characteristics. + * + * While it is possible for this device to be associated + * with a different physical device than the data_dev, it + * is intended for it to be the same. + * |--------- Physical Device ---------| + * |- meta_dev -|------ data_dev ------| + */ + struct dm_dev *meta_dev; + struct dm_dev *data_dev; + struct mdk_rdev_s rdev; +}; + +/* + * Flags for rs->print_flags field. 
+ */ +#define DMPF_DAEMON_SLEEP 0x1 +#define DMPF_MAX_WRITE_BEHIND 0x2 +#define DMPF_SYNC 0x4 +#define DMPF_NOSYNC 0x8 +#define DMPF_STRIPE_CACHE 0x10 +#define DMPF_MIN_RECOVERY_RATE 0x20 +#define DMPF_MAX_RECOVERY_RATE 0x40 + +struct raid_set { + struct dm_target *ti; + + uint64_t print_flags; + + struct mddev_s md; + struct raid_type *raid_type; + struct dm_target_callbacks callbacks; + + struct raid_dev dev[0]; +}; + +/* Supported raid types and properties. */ +static struct raid_type { + const char *name; /* RAID algorithm. */ + const char *descr; /* Descriptor text for logging. */ + const unsigned parity_devs; /* # of parity devices. */ + const unsigned minimal_devs; /* minimal # of devices in set. */ + const unsigned level; /* RAID level. */ + const unsigned algorithm; /* RAID algorithm. */ +} raid_types[] = { + {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, + {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, + {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, + {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, + {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, + {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, + {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, + {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} +}; + +static struct raid_type *get_raid_type(char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(raid_types); i++) + if (!strcmp(raid_types[i].name, name)) + return &raid_types[i]; + + return NULL; +} + +static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) +{ + unsigned i; + struct raid_set *rs; + sector_t sectors_per_dev; + + if (raid_devs <= raid_type->parity_devs) { + ti->error = "Insufficient number of devices"; + return ERR_PTR(-EINVAL); + } + + sectors_per_dev = ti->len; + if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { + ti->error = "Target length not divisible by number of data devices"; + return ERR_PTR(-EINVAL); + } + + rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); + if (!rs) { + ti->error = "Cannot allocate raid context"; + return ERR_PTR(-ENOMEM); + } + + mddev_init(&rs->md); + + rs->ti = ti; + rs->raid_type = raid_type; + rs->md.raid_disks = raid_devs; + rs->md.level = raid_type->level; + rs->md.new_level = rs->md.level; + rs->md.dev_sectors = sectors_per_dev; + rs->md.layout = raid_type->algorithm; + rs->md.new_layout = rs->md.layout; + rs->md.delta_disks = 0; + rs->md.recovery_cp = 0; + + for (i = 0; i < raid_devs; i++) + md_rdev_init(&rs->dev[i].rdev); + + /* + * Remaining items to be initialized by further RAID params: + * rs->md.persistent + * rs->md.external + * rs->md.chunk_sectors + * rs->md.new_chunk_sectors + */ + + return rs; +} + +static void context_free(struct raid_set *rs) +{ + int i; + + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev) + dm_put_device(rs->ti, rs->dev[i].data_dev); + + kfree(rs); +} + +/* + * For every device we have two words + * <meta_dev>: meta device name or '-' if missing + * <data_dev>: data device name or '-' if missing + * + * This code parses those words. 
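+ *
+ * Editor's illustration, not part of the original patch: metadata devices
+ * are not supported yet, so a three-drive set is described by the word
+ * pairs "- /dev/sdb - /dev/sdc - /dev/sdd", a literal '-' for each
+ * metadata slot followed by the data device. A complete (hypothetical)
+ * table line using them could read:
+ *   0 1843200 raid raid5_ls 1 64 3 - /dev/sdb - /dev/sdc - /dev/sdd
+ * Device names and the length are made up for illustration only.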
+ */ +static int dev_parms(struct raid_set *rs, char **argv) +{ + int i; + int rebuild = 0; + int metadata_available = 0; + int ret = 0; + + for (i = 0; i < rs->md.raid_disks; i++, argv += 2) { + rs->dev[i].rdev.raid_disk = i; + + rs->dev[i].meta_dev = NULL; + rs->dev[i].data_dev = NULL; + + /* + * There are no offsets, since there is a separate device + * for data and metadata. + */ + rs->dev[i].rdev.data_offset = 0; + rs->dev[i].rdev.mddev = &rs->md; + + if (strcmp(argv[0], "-")) { + rs->ti->error = "Metadata devices not supported"; + return -EINVAL; + } + + if (!strcmp(argv[1], "-")) { + if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && + (!rs->dev[i].rdev.recovery_offset)) { + rs->ti->error = "Drive designated for rebuild not specified"; + return -EINVAL; + } + + continue; + } + + ret = dm_get_device(rs->ti, argv[1], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].data_dev); + if (ret) { + rs->ti->error = "RAID device lookup failure"; + return ret; + } + + rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; + list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + rebuild++; + } + + if (metadata_available) { + rs->md.external = 0; + rs->md.persistent = 1; + rs->md.major_version = 2; + } else if (rebuild && !rs->md.recovery_cp) { + /* + * Without metadata, we will not be able to tell if the array + * is in-sync or not - we must assume it is not. Therefore, + * it is impossible to rebuild a drive. + * + * Even if there is metadata, the on-disk information may + * indicate that the array is not in-sync and it will then + * fail at that time. + * + * User could specify 'nosync' option if desperate. + */ + DMERR("Unable to rebuild drive while array is not in-sync"); + rs->ti->error = "RAID device lookup failure"; + return -EINVAL; + } + + return 0; +} + +/* + * Possible arguments are... 
+ * RAID456: + * <chunk_size> [optional_args] + * + * Optional args: + * [[no]sync] Force or prevent recovery of the entire array + * [rebuild <idx>] Rebuild the drive indicated by the index + * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits + * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization + * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization + * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) + * [stripe_cache <sectors>] Stripe cache size for higher RAIDs + */ +static int parse_raid_params(struct raid_set *rs, char **argv, + unsigned num_raid_params) +{ + unsigned i, rebuild_cnt = 0; + unsigned long value; + char *key; + + /* + * First, parse the in-order required arguments + */ + if ((strict_strtoul(argv[0], 10, &value) < 0) || + !is_power_of_2(value) || (value < 8)) { + rs->ti->error = "Bad chunk size"; + return -EINVAL; + } + + rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; + argv++; + num_raid_params--; + + /* + * Second, parse the unordered optional arguments + */ + for (i = 0; i < rs->md.raid_disks; i++) + set_bit(In_sync, &rs->dev[i].rdev.flags); + + for (i = 0; i < num_raid_params; i++) { + if (!strcmp(argv[i], "nosync")) { + rs->md.recovery_cp = MaxSector; + rs->print_flags |= DMPF_NOSYNC; + rs->md.flags |= MD_SYNC_STATE_FORCED; + continue; + } + if (!strcmp(argv[i], "sync")) { + rs->md.recovery_cp = 0; + rs->print_flags |= DMPF_SYNC; + rs->md.flags |= MD_SYNC_STATE_FORCED; + continue; + } + + /* The rest of the optional arguments come in key/value pairs */ + if ((i + 1) >= num_raid_params) { + rs->ti->error = "Wrong number of raid parameters given"; + return -EINVAL; + } + + key = argv[i++]; + if (strict_strtoul(argv[i], 10, &value) < 0) { + rs->ti->error = "Bad numerical argument given in raid params"; + return -EINVAL; + } + + if (!strcmp(key, "rebuild")) { + if (++rebuild_cnt > rs->raid_type->parity_devs) { + rs->ti->error = "Too many rebuild drives given"; + return -EINVAL; + } + if (value > rs->md.raid_disks) { + rs->ti->error = "Invalid rebuild index given"; + return -EINVAL; + } + clear_bit(In_sync, &rs->dev[value].rdev.flags); + rs->dev[value].rdev.recovery_offset = 0; + } else if (!strcmp(key, "max_write_behind")) { + rs->print_flags |= DMPF_MAX_WRITE_BEHIND; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + if (value > COUNTER_MAX) { + rs->ti->error = "Max write-behind limit out of range"; + return -EINVAL; + } + rs->md.bitmap_info.max_write_behind = value; + } else if (!strcmp(key, "daemon_sleep")) { + rs->print_flags |= DMPF_DAEMON_SLEEP; + if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { + rs->ti->error = "daemon sleep period out of range"; + return -EINVAL; + } + rs->md.bitmap_info.daemon_sleep = value; + } else if (!strcmp(key, "stripe_cache")) { + rs->print_flags |= DMPF_STRIPE_CACHE; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + + if (rs->raid_type->level < 5) { + rs->ti->error = "Inappropriate argument: stripe_cache"; + return -EINVAL; + } + if (raid5_set_cache_size(&rs->md, (int)value)) { + rs->ti->error = "Bad stripe_cache size"; + return -EINVAL; + } + } else if (!strcmp(key, "min_recovery_rate")) { + rs->print_flags |= DMPF_MIN_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "min_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_min = (int)value; + } else if (!strcmp(key, "max_recovery_rate")) { + rs->print_flags |= 
DMPF_MAX_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "max_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_max = (int)value; + } else { + DMERR("Unable to parse RAID parameter: %s", key); + rs->ti->error = "Unable to parse RAID parameters"; + return -EINVAL; + } + } + + /* Assume there are no metadata devices until the drives are parsed */ + rs->md.persistent = 0; + rs->md.external = 1; + + return 0; +} + +static void do_table_event(struct work_struct *ws) +{ + struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); + + dm_table_event(rs->ti->table); +} + +static int raid_is_congested(struct dm_target_callbacks *cb, int bits) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + return md_raid5_congested(&rs->md, bits); +} + +static void raid_unplug(struct dm_target_callbacks *cb) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + md_raid5_unplug_device(rs->md.private); +} + +/* + * Construct a RAID4/5/6 mapping: + * Args: + * <raid_type> <#raid_params> <raid_params> \ + * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } + * + * ** metadata devices are not supported yet, use '-' instead ** + * + * <raid_params> varies by <raid_type>. See 'parse_raid_params' for + * details on possible <raid_params>. + */ +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int ret; + struct raid_type *rt; + unsigned long num_raid_params, num_raid_devs; + struct raid_set *rs = NULL; + + /* Must have at least <raid_type> <#raid_params> */ + if (argc < 2) { + ti->error = "Too few arguments"; + return -EINVAL; + } + + /* raid type */ + rt = get_raid_type(argv[0]); + if (!rt) { + ti->error = "Unrecognised raid_type"; + return -EINVAL; + } + argc--; + argv++; + + /* number of RAID parameters */ + if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) { + ti->error = "Cannot understand number of RAID parameters"; + return -EINVAL; + } + argc--; + argv++; + + /* Skip over RAID params for now and find out # of devices */ + if (num_raid_params + 1 > argc) { + ti->error = "Arguments do not agree with counts given"; + return -EINVAL; + } + + if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || + (num_raid_devs >= INT_MAX)) { + ti->error = "Cannot understand number of raid devices"; + return -EINVAL; + } + + rs = context_alloc(ti, rt, (unsigned)num_raid_devs); + if (IS_ERR(rs)) + return PTR_ERR(rs); + + ret = parse_raid_params(rs, argv, (unsigned)num_raid_params); + if (ret) + goto bad; + + ret = -EINVAL; + + argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ + argv += num_raid_params + 1; + + if (argc != (num_raid_devs * 2)) { + ti->error = "Supplied RAID devices does not match the count given"; + goto bad; + } + + ret = dev_parms(rs, argv); + if (ret) + goto bad; + + INIT_WORK(&rs->md.event_work, do_table_event); + ti->split_io = rs->md.chunk_sectors; + ti->private = rs; + + mutex_lock(&rs->md.reconfig_mutex); + ret = md_run(&rs->md); + rs->md.in_sync = 0; /* Assume already marked dirty */ + mutex_unlock(&rs->md.reconfig_mutex); + + if (ret) { + ti->error = "Fail to run raid array"; + goto bad; + } + + rs->callbacks.congested_fn = raid_is_congested; + rs->callbacks.unplug_fn = raid_unplug; + dm_table_add_target_callbacks(ti->table, &rs->callbacks); + + return 0; + +bad: + context_free(rs); + + return ret; +} + +static void raid_dtr(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + list_del_init(&rs->callbacks.list); + md_stop(&rs->md); 
+ context_free(rs); +} + +static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) +{ + struct raid_set *rs = ti->private; + mddev_t *mddev = &rs->md; + + mddev->pers->make_request(mddev, bio); + + return DM_MAPIO_SUBMITTED; +} + +static int raid_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + struct raid_set *rs = ti->private; + unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ + unsigned sz = 0; + int i; + sector_t sync; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); + + for (i = 0; i < rs->md.raid_disks; i++) { + if (test_bit(Faulty, &rs->dev[i].rdev.flags)) + DMEMIT("D"); + else if (test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT("A"); + else + DMEMIT("a"); + } + + if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) + sync = rs->md.curr_resync_completed; + else + sync = rs->md.recovery_cp; + + if (sync > rs->md.resync_max_sectors) + sync = rs->md.resync_max_sectors; + + DMEMIT(" %llu/%llu", + (unsigned long long) sync, + (unsigned long long) rs->md.resync_max_sectors); + + break; + case STATUSTYPE_TABLE: + /* The string you would use to construct this array */ + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + raid_param_cnt++; /* for rebuilds */ + + raid_param_cnt += (hweight64(rs->print_flags) * 2); + if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) + raid_param_cnt--; + + DMEMIT("%s %u %u", rs->raid_type->name, + raid_param_cnt, rs->md.chunk_sectors); + + if ((rs->print_flags & DMPF_SYNC) && + (rs->md.recovery_cp == MaxSector)) + DMEMIT(" sync"); + if (rs->print_flags & DMPF_NOSYNC) + DMEMIT(" nosync"); + + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT(" rebuild %u", i); + + if (rs->print_flags & DMPF_DAEMON_SLEEP) + DMEMIT(" daemon_sleep %lu", + rs->md.bitmap_info.daemon_sleep); + + if (rs->print_flags & DMPF_MIN_RECOVERY_RATE) + DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min); + + if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) + DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); + + if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) + DMEMIT(" max_write_behind %lu", + rs->md.bitmap_info.max_write_behind); + + if (rs->print_flags & DMPF_STRIPE_CACHE) { + raid5_conf_t *conf = rs->md.private; + + /* convert from kiB to sectors */ + DMEMIT(" stripe_cache %d", + conf ? 
conf->max_nr_stripes * 2 : 0); + } + + DMEMIT(" %d", rs->md.raid_disks); + for (i = 0; i < rs->md.raid_disks; i++) { + DMEMIT(" -"); /* metadata device */ + + if (rs->dev[i].data_dev) + DMEMIT(" %s", rs->dev[i].data_dev->name); + else + DMEMIT(" -"); + } + } + + return 0; +} + +static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +{ + struct raid_set *rs = ti->private; + unsigned i; + int ret = 0; + + for (i = 0; !ret && i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev) + ret = fn(ti, + rs->dev[i].data_dev, + 0, /* No offset on data devs */ + rs->md.dev_sectors, + data); + + return ret; +} + +static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct raid_set *rs = ti->private; + unsigned chunk_size = rs->md.chunk_sectors << 9; + raid5_conf_t *conf = rs->md.private; + + blk_limits_io_min(limits, chunk_size); + blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); +} + +static void raid_presuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + md_stop_writes(&rs->md); +} + +static void raid_postsuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + mddev_suspend(&rs->md); +} + +static void raid_resume(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + mddev_resume(&rs->md); +} + +static struct target_type raid_target = { + .name = "raid", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = raid_ctr, + .dtr = raid_dtr, + .map = raid_map, + .status = raid_status, + .iterate_devices = raid_iterate_devices, + .io_hints = raid_io_hints, + .presuspend = raid_presuspend, + .postsuspend = raid_postsuspend, + .resume = raid_resume, +}; + +static int __init dm_raid_init(void) +{ + return dm_register_target(&raid_target); +} + +static void __exit dm_raid_exit(void) +{ + dm_unregister_target(&raid_target); +} + +module_init(dm_raid_init); +module_exit(dm_raid_exit); + +MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); +MODULE_ALIAS("dm-raid4"); +MODULE_ALIAS("dm-raid5"); +MODULE_ALIAS("dm-raid6"); +MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 19a59b0..dee3267 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti) struct dm_io_request io_req = { .bi_rw = WRITE_FLUSH, .mem.type = DM_IO_KMEM, - .mem.ptr.bvec = NULL, + .mem.ptr.addr = NULL, .client = ms->io_client, }; @@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio) .client = ms->io_client, }; + if (bio->bi_rw & REQ_DISCARD) { + io_req.bi_rw |= REQ_DISCARD; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = NULL; + } + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) map_region(dest++, m, bio); @@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - if (bio->bi_rw & REQ_FLUSH) { + if ((bio->bi_rw & REQ_FLUSH) || + (bio->bi_rw & REQ_DISCARD)) { bio_list_add(&sync, bio); continue; } @@ -1076,8 +1083,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = ms; ti->split_io = dm_rh_get_region_size(ms->rh); ti->num_flush_requests = 1; + ti->num_discard_requests = 1; - ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); + ms->kmirrord_wq = alloc_workqueue("kmirrord", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!ms->kmirrord_wq) { 
DMERR("couldn't start kmirrord"); r = -ENOMEM; @@ -1130,7 +1139,7 @@ static void mirror_dtr(struct dm_target *ti) del_timer_sync(&ms->timer); flush_workqueue(ms->kmirrord_wq); - flush_scheduled_work(); + flush_work_sync(&ms->trigger_event); dm_kcopyd_client_destroy(ms->kcopyd_client); destroy_workqueue(ms->kmirrord_wq); free_context(ms, ti, ms->nr_mirrors); @@ -1406,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 12, 0}, + .version = {1, 12, 1}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 2129cdb..95891df 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, */ INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); - flush_workqueue(ps->metadata_wq); + flush_work(&req.work); return req.result; } @@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store, atomic_set(&ps->pending_count, 0); ps->callbacks = NULL; - ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); + ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); if (!ps->metadata_wq) { kfree(ps); DMERR("couldn't start header metadata update thread"); diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 53cf79d..fdde53c 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -19,7 +19,6 @@ #include <linux/vmalloc.h> #include <linux/log2.h> #include <linux/dm-kcopyd.h> -#include <linux/workqueue.h> #include "dm-exception-store.h" @@ -80,9 +79,6 @@ struct dm_snapshot { /* Origin writes don't trigger exceptions until this is set */ int active; - /* Whether or not owning mapped_device is suspended */ - int suspended; - atomic_t pending_exceptions_count; mempool_t *pending_pool; @@ -106,10 +102,6 @@ struct dm_snapshot { struct dm_kcopyd_client *kcopyd_client; - /* Queue of snapshot writes for ksnapd to flush */ - struct bio_list queued_bios; - struct work_struct queued_bios_work; - /* Wait for events based on state_bits */ unsigned long state_bits; @@ -160,9 +152,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s) } EXPORT_SYMBOL(dm_snap_cow); -static struct workqueue_struct *ksnapd; -static void flush_queued_bios(struct work_struct *work); - static sector_t chunk_to_sector(struct dm_exception_store *store, chunk_t chunk) { @@ -1110,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->ti = ti; s->valid = 1; s->active = 0; - s->suspended = 0; atomic_set(&s->pending_exceptions_count, 0); init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); @@ -1153,9 +1141,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&s->tracked_chunk_lock); - bio_list_init(&s->queued_bios); - INIT_WORK(&s->queued_bios_work, flush_queued_bios); - ti->private = s; ti->num_flush_requests = num_flush_requests; @@ -1279,8 +1264,6 @@ static void snapshot_dtr(struct dm_target *ti) struct dm_snapshot *s = ti->private; struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; - flush_workqueue(ksnapd); - down_read(&_origins_lock); /* Check whether exception handover must be cancelled */ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); @@ -1342,20 +1325,6 @@ static void flush_bios(struct bio *bio) } } -static void flush_queued_bios(struct work_struct *work) -{ - struct 
dm_snapshot *s = - container_of(work, struct dm_snapshot, queued_bios_work); - struct bio *queued_bios; - unsigned long flags; - - spin_lock_irqsave(&s->pe_lock, flags); - queued_bios = bio_list_get(&s->queued_bios); - spin_unlock_irqrestore(&s->pe_lock, flags); - - flush_bios(queued_bios); -} - static int do_origin(struct dm_dev *origin, struct bio *bio); /* @@ -1760,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti) stop_merge(s); } -static void snapshot_postsuspend(struct dm_target *ti) -{ - struct dm_snapshot *s = ti->private; - - down_write(&s->lock); - s->suspended = 1; - up_write(&s->lock); -} - static int snapshot_preresume(struct dm_target *ti) { int r = 0; @@ -1783,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti) DMERR("Unable to resume snapshot source until " "handover completes."); r = -EINVAL; - } else if (!snap_src->suspended) { + } else if (!dm_suspended(snap_src->ti)) { DMERR("Unable to perform snapshot handover until " "source is suspended."); r = -EINVAL; @@ -1816,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti) down_write(&s->lock); s->active = 1; - s->suspended = 0; up_write(&s->lock); } @@ -2194,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti, static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 7, 0}, + .version = {1, 7, 1}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -2207,13 +2166,12 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_map, .end_io = snapshot_end_io, - .postsuspend = snapshot_postsuspend, .preresume = snapshot_preresume, .resume = snapshot_resume, .status = snapshot_status, @@ -2222,14 +2180,13 @@ static struct target_type snapshot_target = { static struct target_type merge_target = { .name = dm_snapshot_merge_target_name, - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_merge_map, .end_io = snapshot_end_io, .presuspend = snapshot_merge_presuspend, - .postsuspend = snapshot_postsuspend, .preresume = snapshot_preresume, .resume = snapshot_merge_resume, .status = snapshot_status, @@ -2291,17 +2248,8 @@ static int __init dm_snapshot_init(void) goto bad_tracked_chunk_cache; } - ksnapd = create_singlethread_workqueue("ksnapd"); - if (!ksnapd) { - DMERR("Failed to create ksnapd workqueue."); - r = -ENOMEM; - goto bad_pending_pool; - } - return 0; -bad_pending_pool: - kmem_cache_destroy(tracked_chunk_cache); bad_tracked_chunk_cache: kmem_cache_destroy(pending_cache); bad_pending_cache: @@ -2322,8 +2270,6 @@ bad_register_snapshot_target: static void __exit dm_snapshot_exit(void) { - destroy_workqueue(ksnapd); - dm_unregister_target(&snapshot_target); dm_unregister_target(&origin_target); dm_unregister_target(&merge_target); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index f0371b4..dddfa14 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -39,23 +39,20 @@ struct stripe_c { struct dm_target *ti; /* Work struct used for triggering events*/ - struct work_struct kstriped_ws; + struct work_struct trigger_event; struct stripe stripe[0]; }; -static struct workqueue_struct *kstriped; - /* * An event is triggered whenever a drive * drops out of a stripe volume. 
*/ static void trigger_event(struct work_struct *work) { - struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); - + struct stripe_c *sc = container_of(work, struct stripe_c, + trigger_event); dm_table_event(sc->ti->table); - } static inline struct stripe_c *alloc_context(unsigned int stripes) @@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } - INIT_WORK(&sc->kstriped_ws, trigger_event); + INIT_WORK(&sc->trigger_event, trigger_event); /* Set pointer to dm target; used in trigger_event */ sc->ti = ti; @@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti) for (i = 0; i < sc->stripes; i++) dm_put_device(ti, sc->stripe[i].dev); - flush_workqueue(kstriped); + flush_work_sync(&sc->trigger_event); kfree(sc); } @@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, atomic_inc(&(sc->stripe[i].error_count)); if (atomic_read(&(sc->stripe[i].error_count)) < DM_IO_ERROR_THRESHOLD) - queue_work(kstriped, &sc->kstriped_ws); + schedule_work(&sc->trigger_event); } return error; @@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti, static struct target_type stripe_target = { .name = "striped", - .version = {1, 3, 0}, + .version = {1, 3, 1}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, @@ -422,20 +419,10 @@ int __init dm_stripe_init(void) return r; } - kstriped = create_singlethread_workqueue("kstriped"); - if (!kstriped) { - DMERR("failed to create workqueue kstriped"); - dm_unregister_target(&stripe_target); - return -ENOMEM; - } - return r; } void dm_stripe_exit(void) { dm_unregister_target(&stripe_target); - destroy_workqueue(kstriped); - - return; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 985c20a..dffa0ac 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -71,6 +71,8 @@ struct dm_table { void *event_context; struct dm_md_mempools *mempools; + + struct list_head target_callbacks; }; /* @@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); + INIT_LIST_HEAD(&t->target_callbacks); atomic_set(&t->holders, 0); t->discards_supported = 1; @@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t) return 0; } +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) +{ + list_add(&cb->list, &t->target_callbacks); +} +EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); + int dm_table_any_congested(struct dm_table *t, int bdi_bits) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; int r = 0; list_for_each_entry(dd, devices, list) { @@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) bdevname(dd->dm_dev.bdev, b)); } + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->congested_fn) + r |= cb->congested_fn(cb, bdi_bits); + return r; } @@ -1264,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); @@ -1276,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t) dm_device_name(t->md), bdevname(dd->dm_dev.bdev, b)); } + + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->unplug_fn) + cb->unplug_fn(cb); } struct mapped_device *dm_table_get_md(struct dm_table *t) diff --git 
a/drivers/md/dm.c b/drivers/md/dm.c index f48a2f3..eaa3af0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -32,7 +32,6 @@ #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" #define DM_COOKIE_LENGTH 24 -static DEFINE_MUTEX(dm_mutex); static const char *_name = DM_NAME; static unsigned int major = 0; @@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) { struct mapped_device *md; - mutex_lock(&dm_mutex); spin_lock(&_minor_lock); md = bdev->bd_disk->private_data; @@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) out: spin_unlock(&_minor_lock); - mutex_unlock(&dm_mutex); return md ? 0 : -ENXIO; } @@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode) { struct mapped_device *md = disk->private_data; - mutex_lock(&dm_mutex); + spin_lock(&_minor_lock); + atomic_dec(&md->open_count); dm_put(md); - mutex_unlock(&dm_mutex); + + spin_unlock(&_minor_lock); return 0; } @@ -1638,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q) if (map_request(ti, clone, md)) goto requeued; - spin_lock_irq(q->queue_lock); + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); } goto out; requeued: - spin_lock_irq(q->queue_lock); + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); plug_and_out: if (!elv_queue_empty(q)) @@ -1884,7 +1885,8 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = create_singlethread_workqueue("kdmflush"); + md->wq = alloc_workqueue("kdmflush", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!md->wq) goto bad_thread; @@ -1992,13 +1994,14 @@ static void event_callback(void *context) wake_up(&md->eventq); } +/* + * Protected by md->suspend_lock obtained by dm_swap_table(). + */ static void __set_size(struct mapped_device *md, sector_t size) { set_capacity(md->disk, size); - mutex_lock(&md->bdev->bd_inode->i_mutex); i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - mutex_unlock(&md->bdev->bd_inode->i_mutex); } /* diff --git a/drivers/md/md.c b/drivers/md/md.c index 7fc090a..cf8594c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -288,10 +288,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) int rv; int cpu; - if (mddev == NULL || mddev->pers == NULL) { + if (mddev == NULL || mddev->pers == NULL + || !mddev->ready) { bio_io_error(bio); return 0; } + smp_rmb(); /* Ensure implications of 'active' are visible */ rcu_read_lock(); if (mddev->suspended) { DEFINE_WAIT(__wait); @@ -703,9 +705,9 @@ static struct mdk_personality *find_pers(int level, char *clevel) } /* return the offset of the super block in 512byte sectors */ -static inline sector_t calc_dev_sboffset(struct block_device *bdev) +static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev) { - sector_t num_sectors = i_size_read(bdev->bd_inode) / 512; + sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; return MD_NEW_SIZE_SECTORS(num_sectors); } @@ -763,7 +765,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, */ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); - bio->bi_bdev = rdev->bdev; + bio->bi_bdev = rdev->meta_bdev ? 
rdev->meta_bdev : rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; @@ -793,7 +795,7 @@ static void bi_complete(struct bio *bio, int error) } int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, - struct page *page, int rw) + struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); struct completion event; @@ -801,8 +803,12 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, rw |= REQ_SYNC | REQ_UNPLUG; - bio->bi_bdev = rdev->bdev; - bio->bi_sector = sector; + bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? + rdev->meta_bdev : rdev->bdev; + if (metadata_op) + bio->bi_sector = sector + rdev->sb_start; + else + bio->bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); init_completion(&event); bio->bi_private = &event; @@ -827,7 +833,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) return 0; - if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ)) + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) goto fail; rdev->sb_loaded = 1; return 0; @@ -989,7 +995,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version * * It also happens to be a multiple of 4Kb. */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; @@ -1330,7 +1336,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) return 0; /* component must fit device */ if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, @@ -2465,6 +2471,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev2->raid_disk == slot) return -EEXIST; + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) + return -ENOSPC; + rdev->raid_disk = slot; if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = slot; @@ -2482,7 +2492,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. 
*/ } else { - if (slot >= rdev->mddev->raid_disks) + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; /* assume it is working */ @@ -3107,7 +3118,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) char nm[20]; if (rdev->raid_disk < 0) continue; - if (rdev->new_raid_disk > mddev->raid_disks) + if (rdev->new_raid_disk >= mddev->raid_disks) rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue; @@ -3736,6 +3747,8 @@ action_show(mddev_t *mddev, char *page) return sprintf(page, "%s\n", type); } +static void reap_sync_thread(mddev_t *mddev); + static ssize_t action_store(mddev_t *mddev, const char *page, size_t len) { @@ -3750,9 +3763,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - mddev->recovery = 0; + reap_sync_thread(mddev); } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -3904,7 +3915,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(mddev_t *mddev, char *page) { - unsigned long max_sectors, resync; + unsigned long long max_sectors, resync; if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); @@ -3915,7 +3926,7 @@ sync_completed_show(mddev_t *mddev, char *page) max_sectors = mddev->dev_sectors; resync = mddev->curr_resync_completed; - return sprintf(page, "%lu / %lu\n", resync, max_sectors); + return sprintf(page, "%llu / %llu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); @@ -4002,19 +4013,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_lo; if (mddev->pers == NULL || mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if (new >= mddev->suspend_hi || - (new > mddev->suspend_lo && new < mddev->suspend_hi)) { - mddev->suspend_lo = new; + + mddev->suspend_lo = new; + if (new >= old) + /* Shrinking suspended region */ mddev->pers->quiesce(mddev, 2); - return len; - } else - return -EINVAL; + else { + /* Expanding suspended region - need to wait */ + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -4031,20 +4047,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_hi; if (mddev->pers == NULL || mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || - (new > mddev->suspend_lo && new > mddev->suspend_hi)) { - mddev->suspend_hi = new; + + mddev->suspend_hi = new; + if (new <= old) + /* Shrinking suspended region */ + mddev->pers->quiesce(mddev, 2); + else { + /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); - return len; - } else - return -EINVAL; + } + return len; } static struct md_sysfs_entry md_suspend_hi = 
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); @@ -4422,7 +4442,9 @@ int md_run(mddev_t *mddev) * We don't want the data to overlap the metadata, * Internal Bitmap issues have been handled elsewhere. */ - if (rdev->data_offset < rdev->sb_start) { + if (rdev->meta_bdev) { + /* Nothing to check */; + } else if (rdev->data_offset < rdev->sb_start) { if (mddev->dev_sectors && rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { @@ -4556,7 +4578,8 @@ int md_run(mddev_t *mddev) mddev->safemode_timer.data = (unsigned long) mddev; mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; - + smp_wmb(); + mddev->ready = 1; list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0) { char nm[20]; @@ -4693,13 +4716,12 @@ static void md_clean(mddev_t *mddev) mddev->plug = NULL; } -void md_stop_writes(mddev_t *mddev) +static void __md_stop_writes(mddev_t *mddev) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; + reap_sync_thread(mddev); } del_timer_sync(&mddev->safemode_timer); @@ -4713,10 +4735,18 @@ void md_stop_writes(mddev_t *mddev) md_update_sb(mddev, 1); } } + +void md_stop_writes(mddev_t *mddev) +{ + mddev_lock(mddev); + __md_stop_writes(mddev); + mddev_unlock(mddev); +} EXPORT_SYMBOL_GPL(md_stop_writes); void md_stop(mddev_t *mddev) { + mddev->ready = 0; mddev->pers->stop(mddev); if (mddev->pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; @@ -4736,7 +4766,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open) goto out; } if (mddev->pers) { - md_stop_writes(mddev); + __md_stop_writes(mddev); err = -ENXIO; if (mddev->ro==1) @@ -4773,7 +4803,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mddev->ro) set_disk_ro(disk, 0); - md_stop_writes(mddev); + __md_stop_writes(mddev); md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; @@ -5151,9 +5181,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) /* set saved_raid_disk if appropriate */ if (!mddev->persistent) { if (info->state & (1<<MD_DISK_SYNC) && - info->raid_disk < mddev->raid_disks) + info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - else + set_bit(In_sync, &rdev->flags); + } else rdev->raid_disk = -1; } else super_types[mddev->major_version]. @@ -5230,7 +5261,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) printk(KERN_INFO "md: nonpersistent superblock ...\n"); rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; } else - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); @@ -5297,7 +5328,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) } if (mddev->persistent) - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); else rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; @@ -5510,7 +5541,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) * sb_start or, if that is <data_offset, it must fit before the size * of each device. If num_sectors is zero, we find the largest size * that fits. 
- */ if (mddev->sync_thread) return -EBUSY; @@ -6033,7 +6063,8 @@ static int md_thread(void * arg) || kthread_should_stop(), thread->timeout); - if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags)) + clear_bit(THREAD_WAKEUP, &thread->flags); + if (!kthread_should_stop()) thread->run(thread->mddev); } @@ -6799,7 +6830,7 @@ void md_do_sync(mddev_t *mddev) desc, mdname(mddev)); mddev->curr_resync = j; } - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = j; while (j < max_sectors) { sector_t sectors; @@ -6817,8 +6848,7 @@ void md_do_sync(mddev_t *mddev) md_unplug(mddev); wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); - mddev->curr_resync_completed = - mddev->curr_resync; + mddev->curr_resync_completed = j; set_bit(MD_CHANGE_CLEAN, &mddev->flags); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } @@ -7023,6 +7053,45 @@ static int remove_and_add_spares(mddev_t *mddev) } return spares; } + +static void reap_sync_thread(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + md_update_sb(mddev, 1); + + /* if array is no-longer degraded, then any saved_raid_disk + * information must be scrapped + */ + if (!mddev->degraded) + list_for_each_entry(rdev, &mddev->disks, same_set) + rdev->saved_raid_disk = -1; + + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); +} + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. 
@@ -7047,9 +7116,6 @@ static int remove_and_add_spares(mddev_t *mddev) */ void md_check_recovery(mddev_t *mddev) { - mdk_rdev_t *rdev; - - if (mddev->bitmap) bitmap_daemon_work(mddev); @@ -7117,34 +7183,7 @@ void md_check_recovery(mddev_t *mddev) goto unlock; } if (mddev->sync_thread) { - /* resync has finished, collect result */ - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) - mddev->pers->finish_reshape(mddev); - md_update_sb(mddev, 1); - - /* if array is no-longer degraded, then any saved_raid_disk - * information must be scrapped - */ - if (!mddev->degraded) - list_for_each_entry(rdev, &mddev->disks, same_set) - rdev->saved_raid_disk = -1; - - mddev->recovery = 0; - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); + reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid @@ -7202,7 +7241,11 @@ void md_check_recovery(mddev_t *mddev) " thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ - mddev->recovery = 0; + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); } else md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); diff --git a/drivers/md/md.h b/drivers/md/md.h index d05bab5..eec517c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -60,6 +60,12 @@ struct mdk_rdev_s mddev_t *mddev; /* RAID array if running */ int last_events; /* IO event timestamp */ + /* + * If meta_bdev is non-NULL, it means that a separate device is + * being used to store the metadata (superblock/bitmap) which + * would otherwise be contained on the same device as the data (bdev). 
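+ *
+ * Editor's note, not part of the original patch: md_super_write() and
+ * sync_page_io(..., metadata_op = true) use meta_bdev when it is set,
+ * so superblock/bitmap I/O can be redirected while regular data I/O
+ * still goes through bdev.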
+ */ + struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ struct page *sb_page; @@ -148,7 +154,8 @@ struct mddev_s * are happening, so run/ * takeover/stop are not safe */ - + int ready; /* See when safe to pass + * IO requests down */ struct gendisk *gendisk; struct kobject kobj; @@ -497,8 +504,8 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, - struct page *page, int rw); +extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, + struct page *page, int rw, bool metadata_op); extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); extern int md_allow_write(mddev_t *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 845cf95..a23ffa3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1027,8 +1027,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", + printk(KERN_ALERT + "md/raid1:%s: Disk failure on %s, disabling device.\n" + "md/raid1:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); } @@ -1364,10 +1365,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) */ rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - READ)) { + READ, false)) { success = 1; break; } @@ -1390,10 +1391,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) rdev = conf->mirrors[d].rdev; atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - WRITE) == 0) + WRITE, false) == 0) md_error(mddev, rdev); } d = start; @@ -1405,10 +1406,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) continue; rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - READ) == 0) + READ, false) == 0) md_error(mddev, rdev); } } else { @@ -1488,10 +1489,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) + sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false)) success = 1; else { d++; @@ -1514,9 +1513,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, WRITE, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); @@ -1531,9 +1529,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 
0641674..69b6595 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1051,8 +1051,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", + printk(KERN_ALERT + "md/raid10:%s: Disk failure on %s, disabling device.\n" + "md/raid10:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); } @@ -1559,9 +1560,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); success = sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, + sect, s<<9, - conf->tmppage, READ); + conf->tmppage, READ, false); rdev_dec_pending(rdev, mddev); rcu_read_lock(); if (success) @@ -1598,8 +1599,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) + sect, + s<<9, conf->tmppage, WRITE, false) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE @@ -1635,9 +1636,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); if (sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, + sect, s<<9, conf->tmppage, - READ) == 0) { + READ, false) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE "md/raid10:%s: unable to read back " diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dc574f3..5044bab 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1721,7 +1721,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), @@ -4237,7 +4236,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); @@ -4338,7 +4337,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; + mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); @@ -5339,7 +5338,7 @@ static int raid5_spare_active(mddev_t *mddev) && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); @@ -5528,8 +5527,8 @@ static int raid5_start_reshape(mddev_t *mddev) return -ENOSPC; list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk < 0 && - !test_bit(Faulty, &rdev->flags)) + if ((rdev->raid_disk < 0 || rdev->raid_disk >= conf->raid_disks) + && !test_bit(Faulty, &rdev->flags)) 
spares++; if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) @@ -5589,6 +5588,11 @@ static int raid5_start_reshape(mddev_t *mddev) /* Failure here is OK */; } else break; + } else if (rdev->raid_disk >= conf->previous_raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ + set_bit(In_sync, &rdev->flags); + added_devices++; } /* When a reshape changes the number of devices, ->degraded diff --git a/drivers/serial/atmel_serial.c b/drivers/serial/atmel_serial.c index 3892666..2a1d52f 100644 --- a/drivers/serial/atmel_serial.c +++ b/drivers/serial/atmel_serial.c @@ -1732,6 +1732,11 @@ static int __devinit atmel_serial_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 1); platform_set_drvdata(pdev, port); + if (port->rs485.flags & SER_RS485_ENABLED) { + UART_PUT_MR(&port->uart, ATMEL_US_USMODE_NORMAL); + UART_PUT_CR(&port->uart, ATMEL_US_RTSEN); + } + return 0; err_add_port: diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 5a48ce9..07bec09 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -71,11 +71,18 @@ config XEN_SYS_HYPERVISOR but will have no xen contents. config XEN_XENBUS_FRONTEND - tristate + tristate + +config XEN_GNTDEV + tristate "userspace grant access device driver" + depends on XEN + select MMU_NOTIFIER + help + Allows userspace processes to use grants. config XEN_PLATFORM_PCI tristate "xen platform pci device driver" - depends on XEN_PVHVM + depends on XEN_PVHVM && PCI default m help Driver for the Xen PCI Platform device: it is responsible for diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 533a199..5088cc2 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -9,11 +9,14 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o +obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o -obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o +obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o obj-$(CONFIG_XEN_DOM0) += pci.o xen-evtchn-y := evtchn.o +xen-gntdev-y := gntdev.o +xen-platform-pci-y := platform-pci.o diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c new file mode 100644 index 0000000..1e31cdc --- /dev/null +++ b/drivers/xen/gntdev.c @@ -0,0 +1,665 @@ +/****************************************************************************** + * gntdev.c + * + * Device for accessing (in user-space) pages that have been granted by other + * domains. + * + * Copyright (c) 2006-2007, D G Murray. + * (c) 2009 Gerd Hoffmann <kraxel@redhat.com> + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#undef DEBUG + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/mmu_notifier.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/slab.h> + +#include <xen/xen.h> +#include <xen/grant_table.h> +#include <xen/gntdev.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, " + "Gerd Hoffmann <kraxel@redhat.com>"); +MODULE_DESCRIPTION("User-space granted page access driver"); + +static int limit = 1024; +module_param(limit, int, 0644); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at " + "once by a gntdev instance"); + +struct gntdev_priv { + struct list_head maps; + uint32_t used; + uint32_t limit; + /* lock protects maps from concurrent changes */ + spinlock_t lock; + struct mm_struct *mm; + struct mmu_notifier mn; +}; + +struct grant_map { + struct list_head next; + struct gntdev_priv *priv; + struct vm_area_struct *vma; + int index; + int count; + int flags; + int is_mapped; + struct ioctl_gntdev_grant_ref *grants; + struct gnttab_map_grant_ref *map_ops; + struct gnttab_unmap_grant_ref *unmap_ops; + struct page **pages; +}; + +/* ------------------------------------------------------------------ */ + +static void gntdev_print_maps(struct gntdev_priv *priv, + char *text, int text_index) +{ +#ifdef DEBUG + struct grant_map *map; + + pr_debug("maps list (priv %p, usage %d/%d)\n", + priv, priv->used, priv->limit); + + list_for_each_entry(map, &priv->maps, next) + pr_debug(" index %2d, count %2d %s\n", + map->index, map->count, + map->index == text_index && text ? 
text : ""); +#endif +} + +static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) +{ + struct grant_map *add; + int i; + + add = kzalloc(sizeof(struct grant_map), GFP_KERNEL); + if (NULL == add) + return NULL; + + add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL); + add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL); + add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL); + add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL); + if (NULL == add->grants || + NULL == add->map_ops || + NULL == add->unmap_ops || + NULL == add->pages) + goto err; + + for (i = 0; i < count; i++) { + add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (add->pages[i] == NULL) + goto err; + } + + add->index = 0; + add->count = count; + add->priv = priv; + + if (add->count + priv->used > priv->limit) + goto err; + + return add; + +err: + if (add->pages) + for (i = 0; i < count; i++) { + if (add->pages[i]) + __free_page(add->pages[i]); + } + kfree(add->pages); + kfree(add->grants); + kfree(add->map_ops); + kfree(add->unmap_ops); + kfree(add); + return NULL; +} + +static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (add->index + add->count < map->index) { + list_add_tail(&add->next, &map->next); + goto done; + } + add->index = map->index + map->count; + } + list_add_tail(&add->next, &priv->maps); + +done: + priv->used += add->count; + gntdev_print_maps(priv, "[new]", add->index); +} + +static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, + int index, int count) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (map->index != index) + continue; + if (map->count != count) + continue; + return map; + } + return NULL; +} + +static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv, + unsigned long vaddr) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + if (vaddr < map->vma->vm_start) + continue; + if (vaddr >= map->vma->vm_end) + continue; + return map; + } + return NULL; +} + +static int gntdev_del_map(struct grant_map *map) +{ + int i; + + if (map->vma) + return -EBUSY; + for (i = 0; i < map->count; i++) + if (map->unmap_ops[i].handle) + return -EBUSY; + + map->priv->used -= map->count; + list_del(&map->next); + return 0; +} + +static void gntdev_free_map(struct grant_map *map) +{ + int i; + + if (!map) + return; + + if (map->pages) + for (i = 0; i < map->count; i++) { + if (map->pages[i]) + __free_page(map->pages[i]); + } + kfree(map->pages); + kfree(map->grants); + kfree(map->map_ops); + kfree(map->unmap_ops); + kfree(map); +} + +/* ------------------------------------------------------------------ */ + +static int find_grant_ptes(pte_t *pte, pgtable_t token, + unsigned long addr, void *data) +{ + struct grant_map *map = data; + unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; + u64 pte_maddr; + + BUG_ON(pgnr >= map->count); + pte_maddr = arbitrary_virt_to_machine(pte).maddr; + + gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, + GNTMAP_contains_pte | map->flags, + map->grants[pgnr].ref, + map->grants[pgnr].domid); + gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, + GNTMAP_contains_pte | map->flags, + 0 /* handle */); + return 0; +} + +static int map_grant_pages(struct grant_map *map) +{ + int i, err = 0; + + pr_debug("map %d+%d\n", map->index, map->count); + err = 
gnttab_map_refs(map->map_ops, map->pages, map->count); + if (err) + return err; + + for (i = 0; i < map->count; i++) { + if (map->map_ops[i].status) + err = -EINVAL; + map->unmap_ops[i].handle = map->map_ops[i].handle; + } + return err; +} + +static int unmap_grant_pages(struct grant_map *map, int offset, int pages) +{ + int i, err = 0; + + pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages); + err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages); + if (err) + return err; + + for (i = 0; i < pages; i++) { + if (map->unmap_ops[offset+i].status) + err = -EINVAL; + map->unmap_ops[offset+i].handle = 0; + } + return err; +} + +/* ------------------------------------------------------------------ */ + +static void gntdev_vma_close(struct vm_area_struct *vma) +{ + struct grant_map *map = vma->vm_private_data; + + pr_debug("close %p\n", vma); + map->is_mapped = 0; + map->vma = NULL; + vma->vm_private_data = NULL; +} + +static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n", + vmf->virtual_address, vmf->pgoff); + vmf->flags = VM_FAULT_ERROR; + return 0; +} + +static struct vm_operations_struct gntdev_vmops = { + .close = gntdev_vma_close, + .fault = gntdev_vma_fault, +}; + +/* ------------------------------------------------------------------ */ + +static void mn_invl_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); + struct grant_map *map; + unsigned long mstart, mend; + int err; + + spin_lock(&priv->lock); + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + if (!map->is_mapped) + continue; + if (map->vma->vm_start >= end) + continue; + if (map->vma->vm_end <= start) + continue; + mstart = max(start, map->vma->vm_start); + mend = min(end, map->vma->vm_end); + pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end, + start, end, mstart, mend); + err = unmap_grant_pages(map, + (mstart - map->vma->vm_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); + WARN_ON(err); + } + spin_unlock(&priv->lock); +} + +static void mn_invl_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); +} + +static void mn_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); + struct grant_map *map; + int err; + + spin_lock(&priv->lock); + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + pr_debug("map %d+%d (%lx %lx)\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end); + err = unmap_grant_pages(map, /* offset */ 0, map->count); + WARN_ON(err); + } + spin_unlock(&priv->lock); +} + +struct mmu_notifier_ops gntdev_mmu_ops = { + .release = mn_release, + .invalidate_page = mn_invl_page, + .invalidate_range_start = mn_invl_range_start, +}; + +/* ------------------------------------------------------------------ */ + +static int gntdev_open(struct inode *inode, struct file *flip) +{ + struct gntdev_priv *priv; + int ret = 0; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + INIT_LIST_HEAD(&priv->maps); + spin_lock_init(&priv->lock); + priv->limit = limit; + + priv->mm = get_task_mm(current); + if (!priv->mm) { + kfree(priv); + return -ENOMEM; + } + 
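	/*
	 * Register an mmu notifier on the caller's address space so that
	 * mn_invl_range_start()/mn_release() above can unmap any granted
	 * pages when the mapping VMAs are torn down (munmap, process exit),
	 * rather than leaving stale hypervisor mappings behind.
	 */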
priv->mn.ops = &gntdev_mmu_ops; + ret = mmu_notifier_register(&priv->mn, priv->mm); + mmput(priv->mm); + + if (ret) { + kfree(priv); + return ret; + } + + flip->private_data = priv; + pr_debug("priv %p\n", priv); + + return 0; +} + +static int gntdev_release(struct inode *inode, struct file *flip) +{ + struct gntdev_priv *priv = flip->private_data; + struct grant_map *map; + int err; + + pr_debug("priv %p\n", priv); + + spin_lock(&priv->lock); + while (!list_empty(&priv->maps)) { + map = list_entry(priv->maps.next, struct grant_map, next); + err = gntdev_del_map(map); + if (WARN_ON(err)) + gntdev_free_map(map); + + } + spin_unlock(&priv->lock); + + mmu_notifier_unregister(&priv->mn, priv->mm); + kfree(priv); + return 0; +} + +static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, + struct ioctl_gntdev_map_grant_ref __user *u) +{ + struct ioctl_gntdev_map_grant_ref op; + struct grant_map *map; + int err; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, add %d\n", priv, op.count); + if (unlikely(op.count <= 0)) + return -EINVAL; + if (unlikely(op.count > priv->limit)) + return -EINVAL; + + err = -ENOMEM; + map = gntdev_alloc_map(priv, op.count); + if (!map) + return err; + if (copy_from_user(map->grants, &u->refs, + sizeof(map->grants[0]) * op.count) != 0) { + gntdev_free_map(map); + return err; + } + + spin_lock(&priv->lock); + gntdev_add_map(priv, map); + op.index = map->index << PAGE_SHIFT; + spin_unlock(&priv->lock); + + if (copy_to_user(u, &op, sizeof(op)) != 0) { + spin_lock(&priv->lock); + gntdev_del_map(map); + spin_unlock(&priv->lock); + gntdev_free_map(map); + return err; + } + return 0; +} + +static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, + struct ioctl_gntdev_unmap_grant_ref __user *u) +{ + struct ioctl_gntdev_unmap_grant_ref op; + struct grant_map *map; + int err = -ENOENT; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count); + + spin_lock(&priv->lock); + map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); + if (map) + err = gntdev_del_map(map); + spin_unlock(&priv->lock); + if (!err) + gntdev_free_map(map); + return err; +} + +static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, + struct ioctl_gntdev_get_offset_for_vaddr __user *u) +{ + struct ioctl_gntdev_get_offset_for_vaddr op; + struct grant_map *map; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); + + spin_lock(&priv->lock); + map = gntdev_find_map_vaddr(priv, op.vaddr); + if (map == NULL || + map->vma->vm_start != op.vaddr) { + spin_unlock(&priv->lock); + return -EINVAL; + } + op.offset = map->index << PAGE_SHIFT; + op.count = map->count; + spin_unlock(&priv->lock); + + if (copy_to_user(u, &op, sizeof(op)) != 0) + return -EFAULT; + return 0; +} + +static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv, + struct ioctl_gntdev_set_max_grants __user *u) +{ + struct ioctl_gntdev_set_max_grants op; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, limit %d\n", priv, op.count); + if (op.count > limit) + return -E2BIG; + + spin_lock(&priv->lock); + priv->limit = op.count; + spin_unlock(&priv->lock); + return 0; +} + +static long gntdev_ioctl(struct file *flip, + unsigned int cmd, unsigned long arg) +{ + struct gntdev_priv *priv = flip->private_data; + void __user *ptr = (void __user *)arg; + + switch 
(cmd) { + case IOCTL_GNTDEV_MAP_GRANT_REF: + return gntdev_ioctl_map_grant_ref(priv, ptr); + + case IOCTL_GNTDEV_UNMAP_GRANT_REF: + return gntdev_ioctl_unmap_grant_ref(priv, ptr); + + case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: + return gntdev_ioctl_get_offset_for_vaddr(priv, ptr); + + case IOCTL_GNTDEV_SET_MAX_GRANTS: + return gntdev_ioctl_set_max_grants(priv, ptr); + + default: + pr_debug("priv %p, unknown cmd %x\n", priv, cmd); + return -ENOIOCTLCMD; + } + + return 0; +} + +static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) +{ + struct gntdev_priv *priv = flip->private_data; + int index = vma->vm_pgoff; + int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + struct grant_map *map; + int err = -EINVAL; + + if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + pr_debug("map %d+%d at %lx (pgoff %lx)\n", + index, count, vma->vm_start, vma->vm_pgoff); + + spin_lock(&priv->lock); + map = gntdev_find_map_index(priv, index, count); + if (!map) + goto unlock_out; + if (map->vma) + goto unlock_out; + if (priv->mm != vma->vm_mm) { + printk(KERN_WARNING "Huh? Other mm?\n"); + goto unlock_out; + } + + vma->vm_ops = &gntdev_vmops; + + vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP; + + vma->vm_private_data = map; + map->vma = vma; + + map->flags = GNTMAP_host_map | GNTMAP_application_map; + if (!(vma->vm_flags & VM_WRITE)) + map->flags |= GNTMAP_readonly; + + spin_unlock(&priv->lock); + + err = apply_to_page_range(vma->vm_mm, vma->vm_start, + vma->vm_end - vma->vm_start, + find_grant_ptes, map); + if (err) { + printk(KERN_WARNING "find_grant_ptes() failure.\n"); + return err; + } + + err = map_grant_pages(map); + if (err) { + printk(KERN_WARNING "map_grant_pages() failure.\n"); + return err; + } + + map->is_mapped = 1; + + return 0; + +unlock_out: + spin_unlock(&priv->lock); + return err; +} + +static const struct file_operations gntdev_fops = { + .owner = THIS_MODULE, + .open = gntdev_open, + .release = gntdev_release, + .mmap = gntdev_mmap, + .unlocked_ioctl = gntdev_ioctl +}; + +static struct miscdevice gntdev_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/gntdev", + .fops = &gntdev_fops, +}; + +/* ------------------------------------------------------------------ */ + +static int __init gntdev_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + err = misc_register(&gntdev_miscdev); + if (err != 0) { + printk(KERN_ERR "Could not register gntdev device\n"); + return err; + } + return 0; +} + +static void __exit gntdev_exit(void) +{ + misc_deregister(&gntdev_miscdev); +} + +module_init(gntdev_init); +module_exit(gntdev_exit); + +/* ------------------------------------------------------------------ */ diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 6c45318..9ef54eb 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -447,6 +447,52 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct page **pages, unsigned int count) +{ + int i, ret; + pte_t *pte; + unsigned long mfn; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); + if (ret) + return ret; + + for (i = 0; i < count; i++) { + /* m2p override only supported for GNTMAP_contains_pte mappings */ + if (!(map_ops[i].flags & GNTMAP_contains_pte)) + continue; + pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + (map_ops[i].host_addr & ~PAGE_MASK)); + mfn = 
pte_mfn(*pte); + ret = m2p_add_override(mfn, pages[i]); + if (ret) + return ret; + } + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_map_refs); + +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, + struct page **pages, unsigned int count) +{ + int i, ret; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); + if (ret) + return ret; + + for (i = 0; i < count; i++) { + ret = m2p_remove_override(pages[i]); + if (ret) + return ret; + } + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_unmap_refs); + static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index c01b5dd..afbe041 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -105,7 +105,7 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, const struct pci_device_id *ent) { int i, ret; - long ioaddr, iolen; + long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; @@ -114,7 +114,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, return i; ioaddr = pci_resource_start(pdev, 0); - iolen = pci_resource_len(pdev, 0); mmio_addr = pci_resource_start(pdev, 1); mmio_len = pci_resource_len(pdev, 1); @@ -125,19 +124,13 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, goto pci_out; } - if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) { - dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n", - mmio_addr, mmio_len); - ret = -EBUSY; + ret = pci_request_region(pdev, 1, DRV_NAME); + if (ret < 0) goto pci_out; - } - if (request_region(ioaddr, iolen, DRV_NAME) == NULL) { - dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n", - iolen, ioaddr); - ret = -EBUSY; + ret = pci_request_region(pdev, 0, DRV_NAME); + if (ret < 0) goto mem_out; - } platform_mmio = mmio_addr; platform_mmiolen = mmio_len; @@ -169,9 +162,9 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, return 0; out: - release_region(ioaddr, iolen); + pci_release_region(pdev, 0); mem_out: - release_mem_region(mmio_addr, mmio_len); + pci_release_region(pdev, 1); pci_out: pci_disable_device(pdev); return ret; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 9ed4769..d3b28ab 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -141,13 +141,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) return rc; } -static inode *ecryptfs_get_inode(struct inode *lower_inode, +static struct inode *ecryptfs_get_inode(struct inode *lower_inode, struct super_block *sb) { struct inode *inode; int rc = 0; - lower_inode = lower_dentry->d_inode; if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { rc = -EXDEV; goto out; @@ -202,7 +201,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, { struct inode *lower_inode = lower_dentry->d_inode; struct inode *inode = ecryptfs_get_inode(lower_inode, sb); - if (IS_ERR(inode) + if (IS_ERR(inode)) return PTR_ERR(inode); if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) d_add(dentry, inode); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3d06ccc..59c6e49 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head) return list_entry(head, struct inode, i_wb_list); } -static void bdi_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) +/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. 
*/ +static void bdi_wakeup_flusher(struct backing_dev_info *bdi) { - trace_writeback_queue(bdi, work); - - spin_lock_bh(&bdi->wb_lock); - list_add_tail(&work->list, &bdi->work_list); if (bdi->wb.task) { wake_up_process(bdi->wb.task); } else { @@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * The bdi thread isn't there, wake up the forker thread which * will create and run it. */ - trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } +} + +static void bdi_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + trace_writeback_queue(bdi, work); + + spin_lock_bh(&bdi->wb_lock); + list_add_tail(&work->list, &bdi->work_list); + if (!bdi->wb.task) + trace_writeback_nothread(bdi, work); + bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); } static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic, bool for_background) + bool range_cyclic) { struct wb_writeback_work *work; @@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; - work->for_background = for_background; bdi_queue_work(bdi, work); } @@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) { - __bdi_start_writeback(bdi, nr_pages, true, false); + __bdi_start_writeback(bdi, nr_pages, true); } /** @@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) * @bdi: the backing device to write from * * Description: - * This does WB_SYNC_NONE background writeback. The IO is only - * started when this function returns, we make no guarentees on - * completion. Caller need not hold sb s_umount semaphore. + * This makes sure WB_SYNC_NONE background writeback happens. When + * this function returns, it is only guaranteed that for given BDI + * some IO is happening if we are over background dirty threshold. + * Caller need not hold sb s_umount semaphore. */ void bdi_start_background_writeback(struct backing_dev_info *bdi) { - __bdi_start_writeback(bdi, LONG_MAX, true, true); + /* + * We just wake up the flusher thread. It will perform background + * writeback as soon as there is no other work to do. + */ + trace_writeback_wake_background(bdi); + spin_lock_bh(&bdi->wb_lock); + bdi_wakeup_flusher(bdi); + spin_unlock_bh(&bdi->wb_lock); } /* @@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb, }; unsigned long oldest_jif; long wrote = 0; + long write_chunk; struct inode *inode; if (wbc.for_kupdate) { @@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.range_end = LLONG_MAX; } + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. 
+ * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * __writeback_inodes_sb() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (wbc.sync_mode == WB_SYNC_NONE) + write_chunk = MAX_WRITEBACK_PAGES; + else + write_chunk = LONG_MAX; + wbc.wb_start = jiffies; /* livelock avoidance */ for (;;) { /* @@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb, break; /* + * Background writeout and kupdate-style writeback may + * run forever. Stop them if there is other work to do + * so that e.g. sync can proceed. They'll be restarted + * after the other works are all done. + */ + if ((work->for_background || work->for_kupdate) && + !list_empty(&wb->bdi->work_list)) + break; + + /* * For background writeout, stop when we are below the * background dirty threshold */ @@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb, break; wbc.more_io = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; trace_wbc_writeback_start(&wbc, wb->bdi); @@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb, writeback_inodes_wb(wb, &wbc); trace_wbc_writeback_written(&wbc, wb->bdi); - work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; /* * If we consumed everything, see if we have more @@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Did we write something? Try for more */ - if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) + if (wbc.nr_to_write < write_chunk) continue; /* * Nothing written. Wait for some inode to @@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void) get_nr_dirty_inodes(); } +static long wb_check_background_flush(struct bdi_writeback *wb) +{ + if (over_bground_thresh()) { + + struct wb_writeback_work work = { + .nr_pages = LONG_MAX, + .sync_mode = WB_SYNC_NONE, + .for_background = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &work); + } + + return 0; +} + static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; @@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); + wrote += wb_check_background_flush(wb); clear_bit(BDI_writeback_running, &wb->bdi->state); return wrote; @@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false, false); + __bdi_start_writeback(bdi, nr_pages, false); } rcu_read_unlock(); } @@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); * @sb: the superblock * * This function writes and waits on any dirty inode belonging to this - * super_block. The number of pages synced is returned. + * super_block. */ void sync_inodes_sb(struct super_block *sb) { @@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) EXPORT_SYMBOL(sync_inode); /** - * sync_inode - write an inode to disk + * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. * - * Write an inode to disk and adjust it's dirty state after completion. 
+ * Write an inode to disk and adjust its dirty state after completion. * * Note: only writes the actual inode, no associated data or other metadata. */ @@ -40,7 +40,7 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ -static void mpage_end_io_read(struct bio *bio, int err) +static void mpage_end_io(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - bio_put(bio); -} - -static void mpage_end_io_write(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (!uptodate){ - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); + if (bio_data_dir(bio) == READ) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* bio_data_dir(bio) == WRITE */ + if (!uptodate) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + } + end_page_writeback(page); } - end_page_writeback(page); } while (bvec >= bio->bi_io_vec); bio_put(bio); } static struct bio *mpage_bio_submit(int rw, struct bio *bio) { - bio->bi_end_io = mpage_end_io_read; - if (rw == WRITE) - bio->bi_end_io = mpage_end_io_write; + bio->bi_end_io = mpage_end_io; submit_bio(rw, bio); return NULL; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 95b081b..64ee240 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1579,6 +1579,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, { struct iattr attr; int error; + int open_flags = 0; dfprintk(VFS, "NFS: create(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1586,7 +1587,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); + if ((nd->flags & LOOKUP_CREATE) != 0) + open_flags = nd->intent.open.flags; + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL); if (error != 0) goto out_err; return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 93f1cdd..9d096e8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj && + if (oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; @@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, atomic_dec(&task->mm->oom_disable_count); } task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. 
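
The two fs/proc/base.c hunks above change the /proc/<pid>/oom_score_adj write policy: the lower bound for unprivileged writers is now the recorded oom_score_adj_min rather than the task's current value, and a CAP_SYS_RESOURCE writer drags that floor along with the value it sets. A minimal sketch of the resulting rule, condensed from the hunks (the helper name is illustrative, not from the patch; locking, sighand handling and error labels omitted):

	/* Condensed restatement of the oom_score_adj_write() checks above. */
	static int oom_score_adj_update(struct task_struct *task, int oom_score_adj)
	{
		/* Unprivileged writers may not go below the recorded minimum. */
		if (oom_score_adj < task->signal->oom_score_adj_min &&
		    !capable(CAP_SYS_RESOURCE))
			return -EACCES;

		task->signal->oom_score_adj = oom_score_adj;

		/* A privileged writer also moves the floor for later unprivileged writes. */
		if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
			task->signal->oom_score_adj_min = oom_score_adj;

		return 0;
	}
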
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a65239c..ed257d1 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_MEMORY_FAILURE "HardwareCorrupted: %5lu kB\n" #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" +#endif , K(i.totalram), K(i.freeram), @@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), + K(global_page_state(NR_ANON_PAGES) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR +#endif + ), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + @@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_MEMORY_FAILURE ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) +#endif ); hugetlb_report_meminfo(m); diff --git a/fs/proc/page.c b/fs/proc/page.c index b06c674..6d8e6a9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page) if (PageHuge(page)) u |= 1 << KPF_HUGE; - u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - /* - * Caveats on high order pages: - * PG_buddy will only be set on the head page; SLUB/SLQB do the same - * for PG_slab; SLOB won't set PG_slab at all on compound pages. + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c3755bd..60b9148 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -418,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v) "Anonymous: %8lu kB\n" "Swap: %8lu kB\n" "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", + "MMUPageSize: %8lu kB\n" + "Locked: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss.resident >> 10, (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -430,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v) mss.anonymous >> 10, mss.swap >> 10, vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); + vma_mmu_pagesize(vma) >> 10, + (vma->vm_flags & VM_LOCKED) ? + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index 6098cae..ff5c660 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -147,11 +147,11 @@ extern struct gpio_chip *gpiochip_find(void *data, /* Always use the library code for GPIO management calls, * or when sleeping may be involved. 
*/ -extern int __must_check gpio_request(unsigned gpio, const char *label); +extern int gpio_request(unsigned gpio, const char *label); extern void gpio_free(unsigned gpio); -extern int __must_check gpio_direction_input(unsigned gpio); -extern int __must_check gpio_direction_output(unsigned gpio, int value); +extern int gpio_direction_input(unsigned gpio); +extern int gpio_direction_output(unsigned gpio, int value); extern int gpio_set_debounce(unsigned gpio, unsigned debounce); @@ -192,8 +192,8 @@ struct gpio { const char *label; }; -extern int __must_check gpio_request_one(unsigned gpio, unsigned long flags, const char *label); -extern int __must_check gpio_request_array(struct gpio *array, size_t num); +extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label); +extern int gpio_request_array(struct gpio *array, size_t num); extern void gpio_free_array(struct gpio *array, size_t num); #ifdef CONFIG_GPIO_SYSFS diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h index 3da9e27..787abbb 100644 --- a/include/asm-generic/mman-common.h +++ b/include/asm-generic/mman-common.h @@ -45,6 +45,9 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 6f3c6ae..f1eddf7 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -5,67 +5,108 @@ #ifdef CONFIG_MMU #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -/* - * Largely same as above, but only sets the access flags (dirty, - * accessed, and writable). Furthermore, we know it always gets set - * to a "more permissive" setting, which allows most architectures - * to optimize this. We return whether the PTE actually changed, which - * in turn instructs the caller to do things like update__mmu_cache. 
- * This used to be done in the caller, but sparc needs minor faults to - * force that call on sun4c so we changed this macro slightly - */ -#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ -({ \ - int __changed = !pte_same(*(__ptep), __entry); \ - if (__changed) { \ - set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ - flush_tlb_page(__vma, __address); \ - } \ - __changed; \ -}) +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(__vma, __address, __ptep) \ -({ \ - pte_t __pte = *(__ptep); \ - int r = 1; \ - if (!pte_young(__pte)) \ - r = 0; \ - else \ - set_pte_at((__vma)->vm_mm, (__address), \ - (__ptep), pte_mkold(__pte)); \ - r; \ -}) +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + pte_t pte = *ptep; + int r = 1; + if (!pte_young(pte)) + r = 0; + else + set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); + return r; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + int r = 1; + if (!pmd_young(pmd)) + r = 0; + else + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); + return r; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + BUG(); + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(__vma, __address, __ptep) \ -({ \ - int __young; \ - __young = ptep_test_and_clear_young(__vma, __address, __ptep); \ - if (__young) \ - flush_tlb_page(__vma, __address); \ - __young; \ -}) +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR -#define ptep_get_and_clear(__mm, __address, __ptep) \ -({ \ - pte_t __pte = *(__ptep); \ - pte_clear((__mm), (__address), (__ptep)); \ - __pte; \ +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, + pte_t *ptep) +{ + pte_t pte = *ptep; + pte_clear(mm, address, ptep); + return pte; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + pmd_clear(mm, address, pmdp); + return pmd; }) +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) +{ + BUG(); + return __pmd(0); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL -#define ptep_get_and_clear_full(__mm, __address, __ptep, __full) \ -({ \ - pte_t __pte; \ - __pte = ptep_get_and_clear((__mm), (__address), (__ptep)); \ - __pte; \ -}) +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, + unsigned long address, pte_t *ptep, 
+ int full) +{ + pte_t pte; + pte = ptep_get_and_clear(mm, address, ptep); + return pte; +} #endif /* @@ -74,20 +115,25 @@ * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL -#define pte_clear_not_present_full(__mm, __address, __ptep, __full) \ -do { \ - pte_clear((__mm), (__address), (__ptep)); \ -} while (0) +static inline void pte_clear_not_present_full(struct mm_struct *mm, + unsigned long address, + pte_t *ptep, + int full) +{ + pte_clear(mm, address, ptep); +} #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH -#define ptep_clear_flush(__vma, __address, __ptep) \ -({ \ - pte_t __pte; \ - __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \ - flush_tlb_page(__vma, __address); \ - __pte; \ -}) +extern pte_t ptep_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep); +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT @@ -99,8 +145,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + pmd_t old_pmd = *pmdp; + set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + BUG(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME -#define pte_same(A,B) (pte_val(A) == pte_val(B)) +static inline int pte_same(pte_t pte_a, pte_t pte_b) +{ + return pte_val(pte_a) == pte_val(pte_b); +} +#endif + +#ifndef __HAVE_ARCH_PMD_SAME +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + return pmd_val(pmd_a) == pmd_val(pmd_b); +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + BUG(); + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PAGE_TEST_DIRTY @@ -348,6 +435,24 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); #endif +#ifndef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_huge(pmd_t pmd) +{ + return 0; +} +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return 0; +} +#ifndef __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + BUG(); + return 0; +} +#endif /* __HAVE_ARCH_PMD_WRITE */ +#endif + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_GENERIC_PGTABLE_H */ diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 5ac5155..dfa2ed4 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -11,6 +11,9 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 3 +#define COMPACT_MODE_DIRECT_RECLAIM 0 +#define COMPACT_MODE_KSWAPD 1 + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -21,7 +24,12 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, - int 
order, gfp_t gfp_mask, nodemask_t *mask); + int order, gfp_t gfp_mask, nodemask_t *mask, + bool sync); +extern unsigned long compaction_suitable(struct zone *zone, int order); +extern unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, bool sync, + int compact_mode); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -54,7 +62,20 @@ static inline bool compaction_deferred(struct zone *zone) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) +{ + return COMPACT_CONTINUE; +} + +static inline unsigned long compaction_suitable(struct zone *zone, int order) +{ + return COMPACT_SKIPPED; +} + +static inline unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, bool sync, + int compact_mode) { return COMPACT_CONTINUE; } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 2970022..272496d 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -193,6 +193,13 @@ struct dm_target { char *error; }; +/* Each target can link one of these into the table */ +struct dm_target_callbacks { + struct list_head list; + int (*congested_fn) (struct dm_target_callbacks *, int); + void (*unplug_fn)(struct dm_target_callbacks *); +}; + int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); @@ -269,6 +276,11 @@ int dm_table_add_target(struct dm_table *t, const char *type, sector_t start, sector_t len, char *params); /* + * Target_ctr should call this if it needs to add any callbacks. + */ +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb); + +/* * Finally call this to make the table ready for use. */ int dm_table_complete(struct dm_table *t); diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 49eab36..78bbf47 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -44,7 +44,7 @@ * Remove a device, destroy any tables. * * DM_DEV_RENAME: - * Rename a device. + * Rename a device or set its uuid if none was previously supplied. * * DM_SUSPEND: * This performs both suspend and resume, depending which flag is @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 18 -#define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-06-29)" +#define DM_VERSION_MINOR 19 +#define DM_VERSION_PATCHLEVEL 1 +#define DM_VERSION_EXTRA "-ioctl (2011-01-07)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -322,4 +322,10 @@ enum { */ #define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */ +/* + * If set, rename changes the uuid not the name. Only permitted + * if no uuid was previously supplied: an existing uuid cannot be changed. + */ +#define DM_UUID_FLAG (1 << 14) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h index 0c3c3a2..eeace7d 100644 --- a/include/linux/dm-log-userspace.h +++ b/include/linux/dm-log-userspace.h @@ -370,6 +370,16 @@ #define DM_ULOG_REQUEST_TYPE(request_type) \ (DM_ULOG_REQUEST_MASK & (request_type)) +/* + * DM_ULOG_REQUEST_VERSION is incremented when there is a + * change to the way information is passed between kernel + * and userspace. 
This could be a structure change of + * dm_ulog_request or a change in the way requests are + * issued/handled. Changes are outlined here: + * version 1: Initial implementation + */ +#define DM_ULOG_REQUEST_VERSION 1 + struct dm_ulog_request { /* * The local unique identifier (luid) and the universally unique @@ -383,8 +393,9 @@ struct dm_ulog_request { */ uint64_t luid; char uuid[DM_UUID_LEN]; - char padding[7]; /* Padding because DM_UUID_LEN = 129 */ + char padding[3]; /* Padding because DM_UUID_LEN = 129 */ + uint32_t version; /* See DM_ULOG_REQUEST_VERSION */ int32_t error; /* Used to report back processing errors */ uint32_t seq; /* Sequence number for request */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f54adfc..a3b148a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -34,6 +34,7 @@ struct vm_area_struct; #else #define ___GFP_NOTRACK 0 #endif +#define ___GFP_NO_KSWAPD 0x400000u /* * GFP bitmasks.. @@ -81,13 +82,15 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ +#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) + /* * This may seem redundant, but it's a way of annotating false positives vs. * allocations that simply cannot be supported (e.g. page tables). */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room for 23 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ @@ -106,6 +109,9 @@ struct vm_area_struct; __GFP_HARDWALL | __GFP_HIGHMEM | \ __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) +#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NO_KSWAPD) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) @@ -325,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_current(gfp_mask, order); } -extern struct page *alloc_page_vma(gfp_t gfp_mask, +extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#define alloc_pages_vma(gfp_mask, order, vma, addr) \ + alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) +#define alloc_page_vma(gfp_mask, vma, addr) \ + alloc_pages_vma(gfp_mask, 0, vma, addr) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/include/linux/gpio.h b/include/linux/gpio.h index f79d67f..4b47ed9 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -30,7 +30,7 @@ static inline int gpio_is_valid(int number) return 0; } -static inline int __must_check gpio_request(unsigned gpio, const char *label) +static inline int gpio_request(unsigned gpio, const char *label) { return -ENOSYS; } @@ -62,12 +62,12 @@ static inline void gpio_free_array(struct gpio *array, size_t num) WARN_ON(1); } -static inline int __must_check gpio_direction_input(unsigned gpio) +static inline int gpio_direction_input(unsigned gpio) { return -ENOSYS; } -static inline int __must_check gpio_direction_output(unsigned gpio, int 
value) +static inline int gpio_direction_output(unsigned gpio, int value) { return -ENOSYS; } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h new file mode 100644 index 0000000..8e6c8c4 --- /dev/null +++ b/include/linux/huge_mm.h @@ -0,0 +1,179 @@ +#ifndef _LINUX_HUGE_MM_H +#define _LINUX_HUGE_MM_H + +extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags); +extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma); +extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pmd_t orig_pmd); +extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm); +extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags); +extern int zap_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd); +extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec); +extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot); + +enum transparent_hugepage_flag { + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, +#ifdef CONFIG_DEBUG_VM + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, +#endif +}; + +enum page_check_address_pmd_flag { + PAGE_CHECK_ADDRESS_PMD_FLAG, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, +}; +extern pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define HPAGE_PMD_SHIFT HPAGE_SHIFT +#define HPAGE_PMD_MASK HPAGE_MASK +#define HPAGE_PMD_SIZE HPAGE_SIZE + +#define transparent_hugepage_enabled(__vma) \ + ((transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_FLAG) || \ + (transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) && \ + ((__vma)->vm_flags & VM_HUGEPAGE))) && \ + !((__vma)->vm_flags & VM_NOHUGEPAGE)) +#define transparent_hugepage_defrag(__vma) \ + ((transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)) || \ + (transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \ + (__vma)->vm_flags & VM_HUGEPAGE)) +#ifdef CONFIG_DEBUG_VM +#define transparent_hugepage_debug_cow() \ + (transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG)) +#else /* CONFIG_DEBUG_VM */ +#define transparent_hugepage_debug_cow() 0 +#endif /* CONFIG_DEBUG_VM */ + +extern unsigned long transparent_hugepage_flags; +extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end); +extern int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, unsigned int flags); +extern int split_huge_page(struct page *page); +extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); +#define split_huge_page_pmd(__mm, __pmd) \ + do { \ + pmd_t *____pmd = (__pmd); \ + if (unlikely(pmd_trans_huge(*____pmd))) \ + __split_huge_page_pmd(__mm, ____pmd); \ + } while (0) +#define 
wait_split_huge_page(__anon_vma, __pmd) \ + do { \ + pmd_t *____pmd = (__pmd); \ + spin_unlock_wait(&(__anon_vma)->root->lock); \ + /* \ + * spin_unlock_wait() is just a loop in C and so the \ + * CPU can reorder anything around it. \ + */ \ + smp_mb(); \ + BUG_ON(pmd_trans_splitting(*____pmd) || \ + pmd_trans_huge(*____pmd)); \ + } while (0) +#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) +#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) +#if HPAGE_PMD_ORDER > MAX_ORDER +#error "hugepages can't be allocated by the buddy allocator" +#endif +extern int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice); +extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next); +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + return; + __vma_adjust_trans_huge(vma, start, end, adjust_next); +} +static inline int hpage_nr_pages(struct page *page) +{ + if (unlikely(PageTransHuge(page))) + return HPAGE_PMD_NR; + return 1; +} +static inline struct page *compound_trans_head(struct page *page) +{ + if (PageTail(page)) { + struct page *head; + head = page->first_page; + smp_rmb(); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail before + * overwriting first_page, so if PageTail is still + * there it means the head pointer isn't dangling. + */ + if (PageTail(page)) + return head; + } + return page; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define HPAGE_PMD_SHIFT ({ BUG(); 0; }) +#define HPAGE_PMD_MASK ({ BUG(); 0; }) +#define HPAGE_PMD_SIZE ({ BUG(); 0; }) + +#define hpage_nr_pages(x) 1 + +#define transparent_hugepage_enabled(__vma) 0 + +#define transparent_hugepage_flags 0UL +static inline int split_huge_page(struct page *page) +{ + return 0; +} +#define split_huge_page_pmd(__mm, __pmd) \ + do { } while (0) +#define wait_split_huge_page(__anon_vma, __pmd) \ + do { } while (0) +#define compound_trans_head(page) compound_head(page) +static inline int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) +{ + BUG(); + return 0; +} +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 979c68c..6a64c6f 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -57,7 +57,7 @@ struct irq_desc { #endif struct timer_rand_state *timer_rand_state; - unsigned int *kstat_irqs; + unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ unsigned int status; /* IRQ status */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 57dac70..5a9d905 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -600,6 +600,13 @@ struct sysinfo { #define NUMA_BUILD 0 #endif +/* This helps us avoid #ifdef CONFIG_COMPACTION */ +#ifdef CONFIG_COMPACTION +#define COMPACTION_BUILD 1 +#else +#define COMPACTION_BUILD 0 +#endif + /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44e83ba..0cce2db 100644 --- a/include/linux/kernel_stat.h +++ 
b/include/linux/kernel_stat.h @@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); #ifndef CONFIG_GENERIC_HARDIRQS -#define kstat_irqs_this_cpu(irq) \ - (this_cpu_read(kstat.irqs[irq]) struct irq_desc; static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) { - kstat_this_cpu.irqs[irq]++; - kstat_this_cpu.irqs_sum++; + __this_cpu_inc(kstat.irqs[irq]); + __this_cpu_inc(kstat.irqs_sum); } static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) @@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) #else #include <linux/irq.h> extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#define kstat_irqs_this_cpu(DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]) -#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\ - ((DESC)->kstat_irqs[smp_processor_id()]++);\ - kstat_this_cpu.irqs_sum++; } while (0) + +#define kstat_incr_irqs_this_cpu(irqno, DESC) \ +do { \ + __this_cpu_inc(*(DESC)->kstat_irqs); \ + __this_cpu_inc(kstat.irqs_sum); \ +} while (0) #endif static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) { - kstat_this_cpu.softirqs[irq]++; + __this_cpu_inc(kstat.softirqs[irq]); } static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h new file mode 100644 index 0000000..6b394f0 --- /dev/null +++ b/include/linux/khugepaged.h @@ -0,0 +1,67 @@ +#ifndef _LINUX_KHUGEPAGED_H +#define _LINUX_KHUGEPAGED_H + +#include <linux/sched.h> /* MMF_VM_HUGEPAGE */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern int __khugepaged_enter(struct mm_struct *mm); +extern void __khugepaged_exit(struct mm_struct *mm); +extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); + +#define khugepaged_enabled() \ + (transparent_hugepage_flags & \ + ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \ + (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))) +#define khugepaged_always() \ + (transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_FLAG)) +#define khugepaged_req_madv() \ + (transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) +#define khugepaged_defrag() \ + (transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) + +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) + return __khugepaged_enter(mm); + return 0; +} + +static inline void khugepaged_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + __khugepaged_exit(mm); +} + +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) + if ((khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) && + !(vma->vm_flags & VM_NOHUGEPAGE)) + if (__khugepaged_enter(vma->vm_mm)) + return -ENOMEM; + return 0; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + return 0; +} +static inline void khugepaged_exit(struct mm_struct *mm) +{ +} +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + return 0; +} +static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 159a0762..6a576f9 100644 --- a/include/linux/memcontrol.h +++ 
b/include/linux/memcontrol.h @@ -25,6 +25,11 @@ struct page_cgroup; struct page; struct mm_struct; +/* Stats that can be updated by kernel. */ +enum mem_cgroup_page_stat_item { + MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ +}; + extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, @@ -93,7 +98,7 @@ extern int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage); + struct page *oldpage, struct page *newpage, bool migration_ok); /* * For memory reclaim. @@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void) return false; } -void mem_cgroup_update_file_mapped(struct page *page, int val); +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, + int val); + +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, 1); +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, -1); +} + unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); @@ -231,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, } static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, - struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { } @@ -293,8 +312,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void mem_cgroup_update_file_mapped(struct page *page, - int val) +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) { } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 31c237a..24376fe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -13,12 +13,16 @@ struct mem_section; #ifdef CONFIG_MEMORY_HOTPLUG /* - * Types for free bootmem. - * The normal smallest mapcount is -1. Here is smaller value than it. + * Types for free bootmem stored in page->lru.next. These have to be in + * some random range in unsigned long space for debugging purposes. 
*/ -#define SECTION_INFO (-1 - 1) -#define MIX_SECTION_INFO (-1 - 2) -#define NODE_INFO (-1 - 3) +enum { + MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, + SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, + MIX_SECTION_INFO, + NODE_INFO, + MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, +}; /* * pgdat resizing functions diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 085527f..e39aeec 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,9 +13,11 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, bool offlining, + bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, bool offlining, + bool sync); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -33,9 +35,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, bool offlining, + bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, bool offlining, + bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 721f451..956a355 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -14,6 +14,7 @@ #include <linux/mm_types.h> #include <linux/range.h> #include <linux/pfn.h> +#include <linux/bit_spinlock.h> struct mempolicy; struct anon_vma; @@ -82,6 +83,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSUP 0x00000200 #else #define VM_GROWSUP 0x00000000 +#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */ #endif #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ @@ -101,7 +103,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ +#ifndef CONFIG_TRANSPARENT_HUGEPAGE #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#else +#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */ +#endif #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ @@ -242,6 +248,7 @@ struct inode; * files which need it (119 of them) */ #include <linux/page-flags.h> +#include <linux/huge_mm.h> /* * Methods to modify the page usage count. 
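/*
 * Editor's illustrative aside, not part of this patch: VM_HUGEPAGE and
 * VM_NOHUGEPAGE (added above) are set and cleared from userspace with
 * madvise().  A minimal sketch; the advice values below are the ones used
 * by kernels carrying this series (asm-generic/mman-common.h) and are only
 * defined here in case the libc headers predate them.
 */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE	14
#endif
#ifndef MADV_NOHUGEPAGE
#define MADV_NOHUGEPAGE	15
#endif

int main(void)
{
	size_t len = 8UL << 20;		/* 8MB anonymous region */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* Mark the vma VM_HUGEPAGE so the fault path/khugepaged may use THP. */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");
	/* ... touch the memory ... */
	munmap(p, len);
	return 0;
}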
@@ -305,6 +312,39 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif +static inline void compound_lock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_lock(PG_compound_lock, &page->flags); +#endif +} + +static inline void compound_unlock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_unlock(PG_compound_lock, &page->flags); +#endif +} + +static inline unsigned long compound_lock_irqsave(struct page *page) +{ + unsigned long uninitialized_var(flags); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + local_irq_save(flags); + compound_lock(page); +#endif + return flags; +} + +static inline void compound_unlock_irqrestore(struct page *page, + unsigned long flags) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + compound_unlock(page); + local_irq_restore(flags); +#endif +} + static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) @@ -319,9 +359,29 @@ static inline int page_count(struct page *page) static inline void get_page(struct page *page) { - page = compound_head(page); - VM_BUG_ON(atomic_read(&page->_count) == 0); + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. Only if + * we're getting a tail page, the elevated page->_count is + * required only in the head page, so for tail pages the + * bugcheck only verifies that the page->_count isn't + * negative. + */ + VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); atomic_inc(&page->_count); + /* + * Getting a tail page will elevate both the head and tail + * page->_count(s). + */ + if (unlikely(PageTail(page))) { + /* + * This is safe only because + * __split_huge_page_refcount can't run under + * get_page(). + */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + atomic_inc(&page->first_page->_count); + } } static inline struct page *virt_to_head_page(const void *x) @@ -339,6 +399,27 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } +/* + * PageBuddy() indicate that the page is free and in the buddy system + * (see mm/page_alloc.c). + */ +static inline int PageBuddy(struct page *page) +{ + return atomic_read(&page->_mapcount) == -2; +} + +static inline void __SetPageBuddy(struct page *page) +{ + VM_BUG_ON(atomic_read(&page->_mapcount) != -1); + atomic_set(&page->_mapcount, -2); +} + +static inline void __ClearPageBuddy(struct page *page) +{ + VM_BUG_ON(!PageBuddy(page)); + atomic_set(&page->_mapcount, -1); +} + void put_page(struct page *page); void put_pages_list(struct list_head *pages); @@ -370,12 +451,39 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } +static inline int compound_trans_order(struct page *page) +{ + int order; + unsigned long flags; + + if (!PageHead(page)) + return 0; + + flags = compound_lock_irqsave(page); + order = compound_order(page); + compound_unlock_irqrestore(page, flags); + return order; +} + static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; } /* + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when + * servicing faults for write access. In the normal case, do always want + * pte_mkwrite. But get_user_pages can cause write faults for mappings + * that do not have writing enabled, when used by access_process_vm. 
+ */ +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pte = pte_mkwrite(pte); + return pte; +} + +/* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have @@ -657,7 +765,7 @@ static inline struct address_space *page_mapping(struct page *page) VM_BUG_ON(PageSlab(page)); if (unlikely(PageSwapCache(page))) mapping = &swapper_space; - else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON)) + else if ((unsigned long)mapping & PAGE_MAPPING_ANON) mapping = NULL; return mapping; } @@ -1064,7 +1172,8 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); #endif -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); /* @@ -1133,16 +1242,18 @@ static inline void pgtable_page_dtor(struct page *page) pte_unmap(pte); \ } while (0) -#define pte_alloc_map(mm, pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ - NULL: pte_offset_map(pmd, address)) +#define pte_alloc_map(mm, vma, pmd, address) \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ + pmd, address))? \ + NULL: pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ + pmd, address))? \ NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) extern void free_area_init(unsigned long * zones_size); @@ -1415,6 +1526,8 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_MLOCK 0x40 /* mark page as mlocked */ +#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); @@ -1518,5 +1631,14 @@ static inline int is_hwpoison_address(unsigned long addr) extern void dump_page(struct page *page); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +extern void clear_huge_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page); +extern void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8835b87..8f7d247 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,6 +1,8 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include <linux/huge_mm.h> + /** * page_is_file_cache - should the page be on a file LRU or anon LRU? 
* @page: the page to test @@ -20,18 +22,25 @@ static inline int page_is_file_cache(struct page *page) } static inline void -add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, + struct list_head *head) { - list_add(&page->lru, &zone->lru[l].list); - __inc_zone_state(zone, NR_LRU_BASE + l); + list_add(&page->lru, head); + __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); mem_cgroup_add_lru_list(page, l); } static inline void +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +{ + __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); +} + +static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } @@ -66,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page) l += LRU_ACTIVE; } } - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb7288a..26bc4e2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -310,6 +310,9 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pgtable_t pmd_huge_pte; /* protected by page_table_lock */ +#endif /* How many tasks sharing this mm are OOM_DISABLE */ atomic_t oom_disable_count; }; diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h index bf17350..38d3930 100644 --- a/include/linux/mmc/sh_mmcif.h +++ b/include/linux/mmc/sh_mmcif.h @@ -94,12 +94,12 @@ struct sh_mmcif_plat_data { static inline u32 sh_mmcif_readl(void __iomem *addr, int reg) { - return readl(addr + reg); + return __raw_readl(addr + reg); } static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val) { - writel(val, addr + reg); + __raw_writel(val, addr + reg); } #define SH_MMCIF_BBS 512 /* boot block size */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 43dcfbd..cc2e7df 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -62,6 +62,16 @@ struct mmu_notifier_ops { unsigned long address); /* + * test_young is called to check the young/accessed bitflag in + * the secondary pte. This is used to know if the page is + * frequently used without actually clearing the flag or tearing + * down the secondary mapping on the page. + */ + int (*test_young)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address); + + /* * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. 
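/*
 * Editor's sketch, not part of this patch: the mmu_notifier_test_young()
 * wrapper added in this hunk lets the primary MM sample the referenced
 * state of a secondary mapping without clearing it, alongside the
 * ordinary pte young bit -- which is, for instance, what khugepaged
 * needs when scanning candidate ptes.  The helper name below is
 * hypothetical and only illustrates the combination.
 */
#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static inline int example_pte_referenced(struct vm_area_struct *vma,
					 unsigned long address, pte_t pteval)
{
	return pte_young(pteval) ||
	       mmu_notifier_test_young(vma->vm_mm, address);
}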
*/ @@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address); +extern int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_test_young(mm, address); + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { @@ -243,6 +263,32 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __pte; \ }) +#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \ +({ \ + pmd_t __pmd; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ + __pmd; \ +}) + +#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \ +({ \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + pmdp_splitting_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ +}) + #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ @@ -254,6 +300,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __young; \ }) +#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ +({ \ + int __young; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ + __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ + ___address); \ + __young; \ +}) + #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ ({ \ struct mm_struct *___mm = __mm; \ @@ -276,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { @@ -305,7 +368,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) } #define ptep_clear_flush_young_notify ptep_clear_flush_young +#define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_flush_notify ptep_clear_flush +#define pmdp_clear_flush_notify pmdp_clear_flush +#define pmdp_splitting_flush_notify pmdp_splitting_flush #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39c24eb..02ecb01 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,7 @@ enum zone_stat_item { NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from 
other node */ #endif + NR_ANON_TRANSPARENT_HUGEPAGES, NR_VM_ZONE_STAT_ITEMS }; /* @@ -458,12 +459,6 @@ static inline int zone_is_oom_locked(const struct zone *zone) return test_bit(ZONE_OOM_LOCKED, &zone->flags); } -#ifdef CONFIG_SMP -unsigned long zone_nr_free_pages(struct zone *zone); -#else -#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES) -#endif /* CONFIG_SMP */ - /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the @@ -645,6 +640,7 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + enum zone_type classzone_idx; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -660,8 +656,10 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); -void wakeup_kswapd(struct zone *zone, int order); -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); enum memmap_context { MEMMAP_EARLY, diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5f38c46..0db8037 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -48,9 +48,6 @@ * struct page (these bits with information) are always mapped into kernel * address space... * - * PG_buddy is set to indicate that the page is free and in the buddy system - * (see mm/page_alloc.c). - * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! @@ -96,7 +93,6 @@ enum pageflags { PG_swapcache, /* Swap page: swp_entry_t in private */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ - PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ #ifdef CONFIG_MMU @@ -108,6 +104,9 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + PG_compound_lock, +#endif __NR_PAGEFLAGS, /* Filesystems */ @@ -198,7 +197,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) -PAGEFLAG(Error, error) +PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) @@ -230,7 +229,6 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) -__PAGEFLAG(Buddy, buddy) PAGEFLAG(MappedToDisk, mappedtodisk) /* PG_readahead is only used for file reads; PG_reclaim is only for writes */ @@ -344,7 +342,7 @@ static inline void set_page_writeback(struct page *page) * tests can be used in performance sensitive paths. PageCompound is * generally not used in hot code paths. 
*/ -__PAGEFLAG(Head, head) +__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) __PAGEFLAG(Tail, tail) static inline int PageCompound(struct page *page) @@ -352,6 +350,13 @@ static inline int PageCompound(struct page *page) return page->flags & ((1L << PG_head) | (1L << PG_tail)); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON(!PageHead(page)); + ClearPageHead(page); +} +#endif #else /* * Reduce page flag use as much as possible by overlapping @@ -389,14 +394,61 @@ static inline void __ClearPageTail(struct page *page) page->flags &= ~PG_head_tail_mask; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound)); + clear_bit(PG_compound, &page->flags); +} +#endif + #endif /* !PAGEFLAGS_EXTENDED */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * PageHuge() only returns true for hugetlbfs pages, but not for + * normal or transparent huge pages. + * + * PageTransHuge() returns true for both transparent huge and + * hugetlbfs pages, but not normal pages. PageTransHuge() can only be + * called only in the core VM paths where hugetlbfs pages can't exist. + */ +static inline int PageTransHuge(struct page *page) +{ + VM_BUG_ON(PageTail(page)); + return PageHead(page); +} + +static inline int PageTransCompound(struct page *page) +{ + return PageCompound(page); +} + +#else + +static inline int PageTransHuge(struct page *page) +{ + return 0; +} + +static inline int PageTransCompound(struct page *page) +{ + return 0; +} +#endif + #ifdef CONFIG_MMU #define __PG_MLOCKED (1 << PG_mlocked) #else #define __PG_MLOCKED 0 #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __PG_COMPOUND_LOCK (1 << PG_compound_lock) +#else +#define __PG_COMPOUND_LOCK 0 +#endif + /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. @@ -404,9 +456,10 @@ static inline void __ClearPageTail(struct page *page) #define PAGE_FLAGS_CHECK_AT_FREE \ (1 << PG_lru | 1 << PG_locked | \ 1 << PG_private | 1 << PG_private_2 | \ - 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ + 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) + 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ + __PG_COMPOUND_LOCK) /* * Flags checked when a page is prepped for return by the page allocator. diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index b02195d..5b0c971 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -35,12 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page); enum { /* flags for mem_cgroup */ - PCG_LOCK, /* page cgroup is locked */ + PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. */ - PCG_ACCT_LRU, /* page has been accounted for */ - PCG_FILE_MAPPED, /* page is accounted as "mapped" */ PCG_MIGRATION, /* under page migration */ + /* flags for mem_cgroup and file and I/O status */ + PCG_MOVE_LOCK, /* For race between move_account v.s. 
following bits */ + PCG_FILE_MAPPED, /* page is accounted as "mapped" */ + PCG_FILE_DIRTY, /* page is dirty */ + PCG_FILE_WRITEBACK, /* page is under writeback */ + PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ + /* No lock in page_cgroup */ + PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ }; #define TESTPCGFLAG(uname, lname) \ @@ -59,6 +65,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ { return test_and_clear_bit(PCG_##lname, &pc->flags); } +#define TESTSETPCGFLAG(uname, lname) \ +static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \ + { return test_and_set_bit(PCG_##lname, &pc->flags); } + /* Cache flag is set only once (at allocation) */ TESTPCGFLAG(Cache, CACHE) CLEARPCGFLAG(Cache, CACHE) @@ -78,6 +88,22 @@ SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) +SETPCGFLAG(FileDirty, FILE_DIRTY) +CLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTPCGFLAG(FileDirty, FILE_DIRTY) +TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTSETPCGFLAG(FileDirty, FILE_DIRTY) + +SETPCGFLAG(FileWriteback, FILE_WRITEBACK) +CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK) +TESTPCGFLAG(FileWriteback, FILE_WRITEBACK) + +SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) + SETPCGFLAG(Migration, MIGRATION) CLEARPCGFLAG(Migration, MIGRATION) TESTPCGFLAG(Migration, MIGRATION) @@ -94,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) static inline void lock_page_cgroup(struct page_cgroup *pc) { + /* + * Don't take this lock in IRQ context. + * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION + */ bit_spin_lock(PCG_LOCK, &pc->flags); } @@ -107,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc) return bit_spin_is_locked(PCG_LOCK, &pc->flags); } +static inline void move_lock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + /* + * We know updates to pc->flags of page cache's stats are from both of + * usual context or IRQ context. Disable IRQ to avoid deadlock. 
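/*
 * Editor's aside, not part of this patch: a hypothetical updater showing
 * how PCG_MOVE_LOCK is meant to be paired around changes to the PCG_FILE_*
 * bits that can race with move_account().  The real updates go through the
 * memcg statistics code in mm/memcontrol.c; this only illustrates the
 * locking rule documented in this header.
 */
#include <linux/page_cgroup.h>

static void example_mark_pc_dirty(struct page_cgroup *pc)
{
	unsigned long flags;

	move_lock_page_cgroup(pc, &flags);	/* disables IRQs */
	if (!TestSetPageCgroupFileDirty(pc)) {
		/* ... bump the owning memcg's dirty counter here ... */
	}
	move_unlock_page_cgroup(pc, &flags);
}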
+ */ + local_irq_save(*flags); + bit_spin_lock(PCG_MOVE_LOCK, &pc->flags); +} + +static inline void move_unlock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags); + local_irq_restore(*flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2d1ffe3..9c66e99 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -48,7 +48,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping) static inline int mapping_unevictable(struct address_space *mapping) { - if (likely(mapping)) + if (mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ab2baa5..23241c2 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -146,6 +146,22 @@ static inline void *radix_tree_deref_slot(void **pslot) } /** + * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held + * @pslot: pointer to slot, returned by radix_tree_lookup_slot + * Returns: item that was stored in that slot with any direct pointer flag + * removed. + * + * Similar to radix_tree_deref_slot but only used during migration when a pages + * mapping is being moved. The caller does not hold the RCU read lock but it + * must hold the tree lock to prevent parallel updates. + */ +static inline void *radix_tree_deref_slot_protected(void **pslot, + spinlock_t *treelock) +{ + return rcu_dereference_protected(*pslot, lockdep_is_held(treelock)); +} + +/** * radix_tree_deref_retry - check radix_tree_deref_slot * @arg: pointer returned by radix_tree_deref_slot * Returns: 0 if retry is not required, otherwise retry is required diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bb83c0d..e9fd04c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,6 +198,8 @@ enum ttu_flags { }; #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) +bool is_vma_temporary_stack(struct vm_area_struct *vma); + int try_to_unmap(struct page *, enum ttu_flags flags); int try_to_unmap_one(struct page *, struct vm_area_struct *, unsigned long address, enum ttu_flags flags); diff --git a/include/linux/sched.h b/include/linux/sched.h index 96e2321..d747f94 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -21,7 +21,8 @@ #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ +/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) + and is now available for re-use. */ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ @@ -433,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm); #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) @@ -633,6 +635,8 @@ struct signal_struct { int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ + int oom_score_adj_min; /* OOM kill score adjustment minimum value. 
+ * Only settable by CAP_SYS_RESOURCE. */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations diff --git a/include/linux/swap.h b/include/linux/swap.h index eba53e7..4d55932 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,6 +208,8 @@ extern unsigned int nr_free_pagecache_pages(void); /* linux/mm/swap.c */ extern void __lru_cache_add(struct page *, enum lru_list lru); extern void lru_cache_add_lru(struct page *, enum lru_list lru); +extern void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 44b54f6..4ed6fcd 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -59,8 +59,9 @@ extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot); +extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller); extern void vfree(const void *addr); extern void *vmap(struct page **pages, unsigned int count, @@ -90,9 +91,6 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, void *caller); -extern struct vm_struct *get_vm_area_node(unsigned long size, - unsigned long flags, int node, - gfp_t gfp_mask); extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, @@ -120,7 +118,7 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #ifdef CONFIG_SMP struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask); + size_t align); void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); #endif diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index eaaea37..833e676 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,6 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); + +int calculate_pressure_threshold(struct zone *zone); +int calculate_normal_threshold(struct zone *zone); +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* @@ -298,6 +303,8 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state +#define set_pgdat_percpu_threshold(pgdat, callback) { } + static inline void refresh_cpu_vm_stats(int cpu) { } #endif diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h new file mode 100644 index 0000000..388bcdd --- /dev/null +++ b/include/trace/events/compaction.h @@ -0,0 +1,74 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM compaction + +#if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_COMPACTION_H + +#include <linux/types.h> +#include <linux/tracepoint.h> +#include "gfpflags.h" + +DECLARE_EVENT_CLASS(mm_compaction_isolate_template, + + 
TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken), + + TP_STRUCT__entry( + __field(unsigned long, nr_scanned) + __field(unsigned long, nr_taken) + ), + + TP_fast_assign( + __entry->nr_scanned = nr_scanned; + __entry->nr_taken = nr_taken; + ), + + TP_printk("nr_scanned=%lu nr_taken=%lu", + __entry->nr_scanned, + __entry->nr_taken) +); + +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, + + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken) +); + +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken) +); + +TRACE_EVENT(mm_compaction_migratepages, + + TP_PROTO(unsigned long nr_migrated, + unsigned long nr_failed), + + TP_ARGS(nr_migrated, nr_failed), + + TP_STRUCT__entry( + __field(unsigned long, nr_migrated) + __field(unsigned long, nr_failed) + ), + + TP_fast_assign( + __entry->nr_migrated = nr_migrated; + __entry->nr_failed = nr_failed; + ), + + TP_printk("nr_migrated=%lu nr_failed=%lu", + __entry->nr_migrated, + __entry->nr_failed) +); + + +#endif /* _TRACE_COMPACTION_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c255fcc..ea422aa 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 89a2b2d..4e249b9 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -81,6 +81,7 @@ DEFINE_EVENT(writeback_class, name, \ TP_ARGS(bdi)) DEFINE_WRITEBACK_EVENT(writeback_nowork); +DEFINE_WRITEBACK_EVENT(writeback_wake_background); DEFINE_WRITEBACK_EVENT(writeback_wake_thread); DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h new file mode 100644 index 0000000..eb23f41 --- /dev/null +++ b/include/xen/gntdev.h @@ -0,0 +1,119 @@ +/****************************************************************************** + * gntdev.h + * + * Interface to /dev/xen/gntdev. 
+ * + * Copyright (c) 2007, D G Murray + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_GNTDEV_H__ +#define __LINUX_PUBLIC_GNTDEV_H__ + +struct ioctl_gntdev_grant_ref { + /* The domain ID of the grant to be mapped. */ + uint32_t domid; + /* The grant reference of the grant to be mapped. */ + uint32_t ref; +}; + +/* + * Inserts the grant references into the mapping table of an instance + * of gntdev. N.B. This does not perform the mapping, which is deferred + * until mmap() is called with @index as the offset. + */ +#define IOCTL_GNTDEV_MAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) +struct ioctl_gntdev_map_grant_ref { + /* IN parameters */ + /* The number of grants to be mapped. */ + uint32_t count; + uint32_t pad; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* Variable IN parameter. */ + /* Array of grant references, of size @count. */ + struct ioctl_gntdev_grant_ref refs[1]; +}; + +/* + * Removes the grant references from the mapping table of an instance of + * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) + * before this ioctl is called, or an error will result. + */ +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +struct ioctl_gntdev_unmap_grant_ref { + /* IN parameters */ + /* The offset was returned by the corresponding map operation. */ + uint64_t index; + /* The number of pages to be unmapped. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Returns the offset in the driver's address space that corresponds + * to @vaddr. This can be used to perform a munmap(), followed by an + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by + * the caller. The number of pages that were allocated at the same time as + * @vaddr is returned in @count. + * + * N.B. Where more than one page has been mapped into a contiguous range, the + * supplied @vaddr must correspond to the start of the range; otherwise + * an error will result. 
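/*
 * Editor's illustrative userspace sketch, not part of this patch: the flow
 * documented above is "insert the grant refs with MAP_GRANT_REF, then
 * mmap() at the returned @index".  The device node path, the single-ref
 * count, the domid/ref values and the availability of <xen/gntdev.h> in
 * userspace are all assumptions; the ioctl and structures are the ones
 * defined in this header.
 */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/gntdev.h>

int main(void)
{
	struct ioctl_gntdev_map_grant_ref op;
	void *page;
	int fd = open("/dev/xen/gntdev", O_RDWR);

	if (fd < 0)
		return 1;

	memset(&op, 0, sizeof(op));
	op.count = 1;			/* map a single grant reference */
	op.refs[0].domid = 0;		/* granting domain (assumed) */
	op.refs[0].ref = 42;		/* grant reference (illustrative) */

	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op))
		return 1;

	/* The mapping itself happens here, at the offset returned in op.index. */
	page = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, op.index);
	if (page == MAP_FAILED)
		return 1;

	/* ... use the shared page, munmap() it, then UNMAP_GRANT_REF ... */
	return 0;
}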
It is only possible to munmap() the entire + * contiguously-allocated range at once, and not any subrange thereof. + */ +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) +struct ioctl_gntdev_get_offset_for_vaddr { + /* IN parameters */ + /* The virtual address of the first mapped page in a range. */ + uint64_t vaddr; + /* OUT parameters */ + /* The offset that was used in the initial mmap() operation. */ + uint64_t offset; + /* The number of pages mapped in the VM area that begins at @vaddr. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Sets the maximum number of grants that may mapped at once by this gntdev + * instance. + * + * N.B. This must be called before any other ioctl is performed on the device. + */ +#define IOCTL_GNTDEV_SET_MAX_GRANTS \ +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) +struct ioctl_gntdev_set_max_grants { + /* IN parameter */ + /* The maximum number of grants that may be mapped at once. */ + uint32_t count; +}; + +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 9a73170..b1fab6b 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -37,10 +37,16 @@ #ifndef __ASM_GNTTAB_H__ #define __ASM_GNTTAB_H__ -#include <asm/xen/hypervisor.h> +#include <asm/page.h> + +#include <xen/interface/xen.h> #include <xen/interface/grant_table.h> + +#include <asm/xen/hypervisor.h> #include <asm/xen/grant_table.h> +#include <xen/features.h> + /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ #define NR_GRANT_FRAMES 4 @@ -107,6 +113,37 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, unsigned long pfn); +static inline void +gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr, + uint32_t flags, grant_ref_t ref, domid_t domid) +{ + if (flags & GNTMAP_contains_pte) + map->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + map->host_addr = __pa(addr); + else + map->host_addr = addr; + + map->flags = flags; + map->ref = ref; + map->dom = domid; +} + +static inline void +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, + uint32_t flags, grant_handle_t handle) +{ + if (flags & GNTMAP_contains_pte) + unmap->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + unmap->host_addr = __pa(addr); + else + unmap->host_addr = addr; + + unmap->handle = handle; + unmap->dev_bus_addr = 0; +} + int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, struct grant_entry **__shared); @@ -118,4 +155,9 @@ unsigned int gnttab_max_grant_frames(void); #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct page **pages, unsigned int count); +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, + struct page **pages, unsigned int count); + #endif /* __ASM_GNTTAB_H__ */ diff --git a/kernel/fork.c b/kernel/fork.c index d9b44f2..25e4291 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> #include <linux/oom.h> +#include <linux/khugepaged.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) retval = ksm_fork(mm, oldmm); if (retval) goto 
out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { @@ -529,6 +533,9 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -543,6 +550,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -669,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + mm->pmd_huge_pte = NULL; +#endif + if (!mm_init(mm, tsk)) goto fail_nomem; @@ -910,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); @@ -1410,23 +1423,6 @@ long do_fork(unsigned long clone_flags, } /* - * We hope to recycle these flags after 2.6.26 - */ - if (unlikely(clone_flags & CLONE_STOPPED)) { - static int __read_mostly count = 100; - - if (count > 0 && printk_ratelimit()) { - char comm[TASK_COMM_LEN]; - - count--; - printk(KERN_INFO "fork(): process `%s' used deprecated " - "clone flags 0x%lx\n", - get_task_comm(comm, current), - clone_flags & CLONE_STOPPED); - } - } - - /* * When called from kernel_thread, don't do user tracing stuff. */ if (likely(user_mode(regs))) @@ -1464,16 +1460,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - if (unlikely(clone_flags & CLONE_STOPPED)) { - /* - * We'll start up with an immediate SIGSTOP. - */ - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - __set_task_state(p, TASK_STOPPED); - } else { - wake_up_new_task(p, clone_flags); - } + wake_up_new_task(p, clone_flags); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); diff --git a/kernel/futex.c b/kernel/futex.c index 3019b92..5207563 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page; + struct page *page, *page_head; int err; /* @@ -265,11 +265,46 @@ again: if (err < 0) return err; - page = compound_head(page); - lock_page(page); - if (!page->mapping) { - unlock_page(page); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page_head = page; + if (unlikely(PageTail(page))) { put_page(page); + /* serialize against __split_huge_page_splitting() */ + local_irq_disable(); + if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { + page_head = compound_head(page); + /* + * page_head is valid pointer but we must pin + * it before taking the PG_lock and/or + * PG_compound_lock. The moment we re-enable + * irqs __split_huge_page_splitting() can + * return and the head page can be freed from + * under us. We can't take the PG_lock and/or + * PG_compound_lock on a page that could be + * freed from under us. 
+ */ + if (page != page_head) { + get_page(page_head); + put_page(page); + } + local_irq_enable(); + } else { + local_irq_enable(); + goto again; + } + } +#else + page_head = compound_head(page); + if (page != page_head) { + get_page(page_head); + put_page(page); + } +#endif + + lock_page(page_head); + if (!page_head->mapping) { + unlock_page(page_head); + put_page(page_head); goto again; } @@ -280,20 +315,20 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page)) { + if (PageAnon(page_head)) { key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page->mapping->host; - key->shared.pgoff = page->index; + key->shared.inode = page_head->mapping->host; + key->shared.pgoff = page_head->index; } get_futex_key_refs(key); - unlock_page(page); - put_page(page); + unlock_page(page_head); + put_page(page_head); return 0; } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03..282f202 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) { + int cpu; + desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; @@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_count = 0; desc->irqs_unhandled = 0; desc->name = NULL; - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + for_each_possible_cpu(cpu) + *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; desc_smp_init(desc, node); } @@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node) if (!desc) return NULL; /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), - gfp, node); + desc->kstat_irqs = alloc_percpu(unsigned int); if (!desc->kstat_irqs) goto err_desc; @@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node) return desc; err_kstat: - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); err_desc: kfree(desc); return NULL; @@ -166,7 +168,7 @@ static void free_desc(unsigned int irq) mutex_unlock(&sparse_irq_lock); free_masks(desc); - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); kfree(desc); } @@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { int count, i, node = first_online_node; @@ -250,7 +251,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - desc[i].kstat_irqs = kstat_irqs_all[i]; + /* TODO : do this allocation on-demand ... 
*/ + desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -275,6 +277,22 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { +#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) + struct irq_desc *desc; + unsigned int i; + + for (i = 0; i < cnt; i++) { + desc = irq_to_desc(start + i); + if (desc && !desc->kstat_irqs) { + unsigned int __percpu *stats = alloc_percpu(unsigned int); + + if (!stats) + return -1; + if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) + free_percpu(stats); + } + } +#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ @@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc ? desc->kstat_irqs[cpu] : 0; + + return desc && desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } #ifdef CONFIG_GENERIC_HARDIRQS @@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq) int cpu; int sum = 0; - if (!desc) + if (!desc || !desc->kstat_irqs) return 0; for_each_possible_cpu(cpu) - sum += desc->kstat_irqs[cpu]; + sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } #endif /* CONFIG_GENERIC_HARDIRQS */ @@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS See Documentation/nommu-mmap.txt for more information. +config TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" + depends on X86 && MMU + select COMPACTION + help + Transparent Hugepages allows the kernel to use huge pages and + huge tlb transparently to the applications whenever possible. + This feature can improve computing performance to certain + applications by speeding up page faults during memory + allocation, by reducing the number of tlb misses and by speeding + up the pagetable walking. + + If memory constrained on embedded, you may want to say N. + +choice + prompt "Transparent Hugepage Support sysfs defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_ALWAYS + help + Selects the sysfs defaults for Transparent Hugepage Support. + + config TRANSPARENT_HUGEPAGE_ALWAYS + bool "always" + help + Enabling Transparent Hugepage always, can increase the + memory footprint of applications without a guaranteed + benefit but it will work automatically for all applications. + + config TRANSPARENT_HUGEPAGE_MADVISE + bool "madvise" + help + Enabling Transparent Hugepage madvise, will only provide a + performance improvement benefit to the applications using + madvise(MADV_HUGEPAGE) but it won't risk to increase the + memory footprint of applications without a guaranteed + benefit. 
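/*
 * Editor's aside, not part of this patch: the Kconfig choice above only
 * selects the boot-time default; the policy stays switchable at runtime
 * through sysfs.  A minimal sketch, assuming the
 * /sys/kernel/mm/transparent_hugepage/enabled file provided by this
 * series (valid values: "always", "madvise", "never").
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/enabled", "w");

	if (!f)
		return 1;
	fputs("madvise", f);	/* honour only madvise(MADV_HUGEPAGE) vmas */
	return fclose(f) ? 1 : 0;
}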
+endchoice + # # UP and nommu archs use km based percpu allocator # diff --git a/mm/Makefile b/mm/Makefile index f73f75a..2b1b575 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,7 +5,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o + vmalloc.o pagewalk.o pgtable-generic.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ @@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/compaction.c b/mm/compaction.c index 1a8894e..6d592a0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -16,6 +16,9 @@ #include <linux/sysfs.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/compaction.h> + /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts @@ -30,6 +33,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ /* Account for isolated anon and file pages */ unsigned long nr_anon; @@ -38,6 +42,8 @@ struct compact_control { unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + + int compact_mode; }; static unsigned long release_freepages(struct list_head *freelist) @@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, struct list_head *freelist) { unsigned long zone_end_pfn, end_pfn; - int total_isolated = 0; + int nr_scanned = 0, total_isolated = 0; struct page *cursor; /* Get the last PFN we should scan for free pages at */ @@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, if (!pfn_valid_within(blockpfn)) continue; + nr_scanned++; if (!PageBuddy(page)) continue; @@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, } } + trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); return total_isolated; } @@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + unsigned long last_pageblock_nr = 0, pageblock_nr; + unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; /* Do not scan outside zone boundaries */ @@ -266,20 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone, struct page *page; if (!pfn_valid_within(low_pfn)) continue; + nr_scanned++; /* Get the page and skip if free */ page = pfn_to_page(low_pfn); if (PageBuddy(page)) continue; + /* + * For async migration, also only scan in MOVABLE blocks. 
Async + * migration is optimistic to see if the minimum amount of work + * satisfies the allocation + */ + pageblock_nr = low_pfn >> pageblock_order; + if (!cc->sync && last_pageblock_nr != pageblock_nr && + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { + low_pfn += pageblock_nr_pages; + low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; + last_pageblock_nr = pageblock_nr; + continue; + } + + if (!PageLRU(page)) + continue; + + /* + * PageLRU is set, and lru_lock excludes isolation, + * splitting and collapsing (collapsing has already + * happened if PageLRU is set). + */ + if (PageTransHuge(page)) { + low_pfn += (1 << compound_order(page)) - 1; + continue; + } + /* Try isolate the page */ if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) continue; + VM_BUG_ON(PageTransCompound(page)); + /* Successfully isolated */ del_page_from_lru_list(zone, page, page_lru(page)); list_add(&page->lru, migratelist); cc->nr_migratepages++; + nr_isolated++; /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) @@ -291,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone, spin_unlock_irq(&zone->lru_lock); cc->migrate_pfn = low_pfn; + trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + return cc->nr_migratepages; } @@ -341,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc) } static int compact_finished(struct zone *zone, - struct compact_control *cc) + struct compact_control *cc) { unsigned int order; - unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + unsigned long watermark; if (fatal_signal_pending(current)) return COMPACT_PARTIAL; @@ -354,12 +397,27 @@ static int compact_finished(struct zone *zone, return COMPACT_COMPLETE; /* Compaction run is not finished if the watermark is not met */ + if (cc->compact_mode != COMPACT_MODE_KSWAPD) + watermark = low_wmark_pages(zone); + else + watermark = high_wmark_pages(zone); + watermark += (1 << cc->order); + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) return COMPACT_CONTINUE; if (cc->order == -1) return COMPACT_CONTINUE; + /* + * Generating only one page of the right order is not enough + * for kswapd, we must continue until we're above the high + * watermark as a pool for high order GFP_ATOMIC allocations + * too. + */ + if (cc->compact_mode == COMPACT_MODE_KSWAPD) + return COMPACT_CONTINUE; + /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { /* Job done if page is free of the right migratetype */ @@ -374,10 +432,62 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? + * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ +unsigned long compaction_suitable(struct zone *zone, int order) +{ + int fragindex; + unsigned long watermark; + + /* + * Watermarks for order-0 must be met for compaction. Note the 2UL. 
+ * This is because during migration, copies of pages need to be + * allocated and for a short time, the footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return COMPACT_SKIPPED; + + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1 implies allocations might succeed depending on watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + return COMPACT_SKIPPED; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) + return COMPACT_PARTIAL; + + return COMPACT_CONTINUE; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + ret = compaction_suitable(zone, cc->order); + switch (ret) { + case COMPACT_PARTIAL: + case COMPACT_SKIPPED: + /* Compaction is likely to fail */ + return ret; + case COMPACT_CONTINUE: + /* Fall through to compaction */ + ; + } + /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; @@ -393,7 +503,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0); + (unsigned long)cc, false, + cc->sync); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -401,6 +512,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); if (nr_remaining) count_vm_events(COMPACTPAGEFAILED, nr_remaining); + trace_mm_compaction_migratepages(nr_migrate - nr_remaining, + nr_remaining); /* Release LRU pages not migrated */ if (!list_empty(&cc->migratepages)) { @@ -417,8 +530,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } -static unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask) +unsigned long compact_zone_order(struct zone *zone, + int order, gfp_t gfp_mask, + bool sync, + int compact_mode) { struct compact_control cc = { .nr_freepages = 0, @@ -426,6 +541,8 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, + .sync = sync, + .compact_mode = compact_mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -441,16 +558,17 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from + * @sync: Whether migration is synchronous or not * * This is the main entry point for direct page compaction.
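compaction_suitable() above skips compaction whenever the fragmentation index is at or below sysctl_extfrag_threshold (500 by default, as in the hunk context). That sysctl is normally exposed as /proc/sys/vm/extfrag_threshold; the short userspace sketch below, which assumes that path, prints the current value and optionally (as root) writes a new one to bias the kernel towards compacting rather than reclaiming.

#include <stdio.h>
#include <stdlib.h>

/* Path assumed for the sysctl_extfrag_threshold knob shown above. */
#define EXTFRAG_SYSCTL "/proc/sys/vm/extfrag_threshold"

int main(int argc, char **argv)
{
	char buf[32];
	FILE *f = fopen(EXTFRAG_SYSCTL, "r");

	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror(EXTFRAG_SYSCTL);
		return 1;
	}
	fclose(f);
	printf("extfrag_threshold = %s", buf);

	if (argc > 1) {		/* e.g. "./extfrag 1000" raises it to the maximum; needs root */
		f = fopen(EXTFRAG_SYSCTL, "w");
		if (!f) {
			perror(EXTFRAG_SYSCTL);
			return 1;
		}
		fprintf(f, "%d\n", atoi(argv[1]));
		fclose(f);
	}
	return 0;
}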
*/ unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; - unsigned long watermark; struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; @@ -460,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, * made because an assumption is made that the page allocator can satisfy * the "cheaper" orders without taking special steps */ - if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) + if (!order || !may_enter_fs || !may_perform_io) return rc; count_vm_event(COMPACTSTALL); @@ -468,43 +586,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - int fragindex; int status; - /* - * Watermarks for order-0 must be met for compaction. Note - * the 2UL. This is because during migration, copies of - * pages need to be allocated and for a short time, the - * footprint is higher - */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - continue; - - /* - * fragmentation index determines if allocation failures are - * due to low memory or external fragmentation - * - * index of -1 implies allocations might succeed depending - * on watermarks - * index towards 0 implies failure is due to lack of memory - * index towards 1000 implies failure is due to fragmentation - * - * Only compact if a failure would be due to fragmentation. - */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - continue; - - if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { - rc = COMPACT_PARTIAL; - break; - } - - status = compact_zone_order(zone, order, gfp_mask); + status = compact_zone_order(zone, order, gfp_mask, sync, + COMPACT_MODE_DIRECT_RECLAIM); rc = max(status, rc); - if (zone_watermark_ok(zone, order, watermark, 0, 0)) + /* If a normal allocation would succeed, stop compacting */ + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; } @@ -531,6 +620,7 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, + .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/dmapool.c b/mm/dmapool.c index 4df2de7..03bf3bb 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, if (mem_flags & __GFP_WAIT) { DECLARE_WAITQUEUE(wait, current); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); __add_wait_queue(&pool->waitq, &wait); spin_unlock_irqrestore(&pool->lock, flags); @@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc); static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) { - unsigned long flags; struct dma_page *page; - spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { if (dma < page->dma) continue; if (dma < (page->dma + pool->allocation)) - goto done; + return page; } - page = NULL; - done: - spin_unlock_irqrestore(&pool->lock, flags); - return page; + return NULL; } /** @@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) unsigned long flags; unsigned int 
offset; + spin_lock_irqsave(&pool->lock, flags); page = pool_find_page(pool, dma); if (!page) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p/%lx (bad dma)\n", @@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) offset = vaddr - page->vaddr; #ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p (bad vaddr)/%Lx\n", @@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) chain = *(int *)(page->vaddr + chain); continue; } + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, dma %Lx " "already free\n", pool->name, @@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) memset(vaddr, POOL_POISON_FREED, pool->size); #endif - spin_lock_irqsave(&pool->lock, flags); page->in_use--; *(int *)vaddr = page->offset; page->offset = offset; diff --git a/mm/filemap.c b/mm/filemap.c index ca38939..83a45d3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -298,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, continue; wait_on_page_writeback(page); - if (PageError(page)) + if (TestClearPageError(page)) ret = -EIO; } pagevec_release(&pvec); @@ -837,9 +837,6 @@ repeat: if (radix_tree_deref_retry(page)) goto restart; - if (page->mapping == NULL || page->index != index) - break; - if (!page_cache_get_speculative(page)) goto repeat; @@ -849,6 +846,16 @@ repeat: goto repeat; } + /* + * must check mapping and index after taking the ref. + * otherwise we can get both false positives and false + * negatives, which is just confusing to the caller. + */ + if (page->mapping == NULL || page->index != index) { + page_cache_release(page); + break; + } + pages[ret] = page; ret++; index++; @@ -2220,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, gfp_notmask = __GFP_FS; repeat: page = find_lock_page(mapping, index); - if (likely(page)) + if (page) return page; page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c new file mode 100644 index 0000000..004c9c2 --- /dev/null +++ b/mm/huge_memory.c @@ -0,0 +1,2346 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/mmu_notifier.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/mm_inline.h> +#include <linux/kthread.h> +#include <linux/khugepaged.h> +#include <linux/freezer.h> +#include <linux/mman.h> +#include <asm/tlb.h> +#include <asm/pgalloc.h> +#include "internal.h" + +/* + * By default transparent hugepage support is enabled for all mappings + * and khugepaged scans all mappings. Defrag is only invoked by + * khugepaged hugepage allocations and by page faults inside + * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived + * allocations. 
+ */ +unsigned long transparent_hugepage_flags __read_mostly = +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS + (1<<TRANSPARENT_HUGEPAGE_FLAG)| +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| +#endif + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); + +/* default scan 8*512 pte (or vmas) every 30 second */ +static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; +static unsigned int khugepaged_pages_collapsed; +static unsigned int khugepaged_full_scans; +static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; +/* during fragmentation poll the hugepage allocator once every minute */ +static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; +static struct task_struct *khugepaged_thread __read_mostly; +static DEFINE_MUTEX(khugepaged_mutex); +static DEFINE_SPINLOCK(khugepaged_mm_lock); +static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); +/* + * default collapse hugepages if there is at least one pte mapped like + * it would have happened if the vma was large enough during page + * fault. + */ +static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; + +static int khugepaged(void *none); +static int mm_slots_hash_init(void); +static int khugepaged_slab_init(void); +static void khugepaged_slab_free(void); + +#define MM_SLOTS_HASH_HEADS 1024 +static struct hlist_head *mm_slots_hash __read_mostly; +static struct kmem_cache *mm_slot_cache __read_mostly; + +/** + * struct mm_slot - hash lookup from mm to mm_slot + * @hash: hash collision list + * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node hash; + struct list_head mm_node; + struct mm_struct *mm; +}; + +/** + * struct khugepaged_scan - cursor for scanning + * @mm_head: the head of the mm list to scan + * @mm_slot: the current mm_slot we are scanning + * @address: the next address inside that to be scanned + * + * There is only the one khugepaged_scan instance of this cursor structure. + */ +struct khugepaged_scan { + struct list_head mm_head; + struct mm_slot *mm_slot; + unsigned long address; +} khugepaged_scan = { + .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), +}; + + +static int set_recommended_min_free_kbytes(void) +{ + struct zone *zone; + int nr_zones = 0; + unsigned long recommended_min; + extern int min_free_kbytes; + + if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags) && + !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + return 0; + + for_each_populated_zone(zone) + nr_zones++; + + /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ + recommended_min = pageblock_nr_pages * nr_zones * 2; + + /* + * Make sure that on average at least two pageblocks are almost free + * of another type, one for a migratetype to fall back to and a + * second to avoid subsequent fallbacks of other types There are 3 + * MIGRATE_TYPES we care about. 
+ */ + recommended_min += pageblock_nr_pages * nr_zones * + MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; + + /* don't ever allow to reserve more than 5% of the lowmem */ + recommended_min = min(recommended_min, + (unsigned long) nr_free_buffer_pages() / 20); + recommended_min <<= (PAGE_SHIFT-10); + + if (recommended_min > min_free_kbytes) + min_free_kbytes = recommended_min; + setup_per_zone_wmarks(); + return 0; +} +late_initcall(set_recommended_min_free_kbytes); + +static int start_khugepaged(void) +{ + int err = 0; + if (khugepaged_enabled()) { + int wakeup; + if (unlikely(!mm_slot_cache || !mm_slots_hash)) { + err = -ENOMEM; + goto out; + } + mutex_lock(&khugepaged_mutex); + if (!khugepaged_thread) + khugepaged_thread = kthread_run(khugepaged, NULL, + "khugepaged"); + if (unlikely(IS_ERR(khugepaged_thread))) { + printk(KERN_ERR + "khugepaged: kthread_run(khugepaged) failed\n"); + err = PTR_ERR(khugepaged_thread); + khugepaged_thread = NULL; + } + wakeup = !list_empty(&khugepaged_scan.mm_head); + mutex_unlock(&khugepaged_mutex); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + set_recommended_min_free_kbytes(); + } else + /* wakeup to exit */ + wake_up_interruptible(&khugepaged_wait); +out: + return err; +} + +#ifdef CONFIG_SYSFS + +static ssize_t double_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag req_madv) +{ + if (test_bit(enabled, &transparent_hugepage_flags)) { + VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); + return sprintf(buf, "[always] madvise never\n"); + } else if (test_bit(req_madv, &transparent_hugepage_flags)) + return sprintf(buf, "always [madvise] never\n"); + else + return sprintf(buf, "always madvise [never]\n"); +} +static ssize_t double_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag req_madv) +{ + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + set_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(enabled, &transparent_hugepage_flags); + set_bit(req_madv, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return double_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); +} +static ssize_t enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + ret = double_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + + if (ret > 0) { + int err = start_khugepaged(); + if (err) + ret = err; + } + + if (ret > 0 && + (test_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags) || + test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags))) + set_recommended_min_free_kbytes(); + + return ret; +} +static struct kobj_attribute enabled_attr = + __ATTR(enabled, 0644, enabled_show, enabled_store); + +static ssize_t single_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum 
transparent_hugepage_flag flag) +{ + if (test_bit(flag, &transparent_hugepage_flags)) + return sprintf(buf, "[yes] no\n"); + else + return sprintf(buf, "yes [no]\n"); +} +static ssize_t single_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag flag) +{ + if (!memcmp("yes", buf, + min(sizeof("yes")-1, count))) { + set_bit(flag, &transparent_hugepage_flags); + } else if (!memcmp("no", buf, + min(sizeof("no")-1, count))) { + clear_bit(flag, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; +} + +/* + * Currently defrag only disables __GFP_NOWAIT for allocation. A blind + * __GFP_REPEAT is too aggressive, it's never worth swapping tons of + * memory just to allocate one more hugepage. + */ +static ssize_t defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return double_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); +} +static ssize_t defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return double_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); +} +static struct kobj_attribute defrag_attr = + __ATTR(defrag, 0644, defrag_show, defrag_store); + +#ifdef CONFIG_DEBUG_VM +static ssize_t debug_cow_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); +} +static ssize_t debug_cow_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); +} +static struct kobj_attribute debug_cow_attr = + __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); +#endif /* CONFIG_DEBUG_VM */ + +static struct attribute *hugepage_attr[] = { + &enabled_attr.attr, + &defrag_attr.attr, +#ifdef CONFIG_DEBUG_VM + &debug_cow_attr.attr, +#endif + NULL, +}; + +static struct attribute_group hugepage_attr_group = { + .attrs = hugepage_attr, +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, 
alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = strict_strtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. 
+ */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = strict_strtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + NULL, +}; + +static struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", +}; +#endif /* CONFIG_SYSFS */ + +static int __init hugepage_init(void) +{ + int err; +#ifdef CONFIG_SYSFS + static struct kobject *hugepage_kobj; +#endif + + err = -EINVAL; + if (!has_transparent_hugepage()) { + transparent_hugepage_flags = 0; + goto out; + } + +#ifdef CONFIG_SYSFS + err = -ENOMEM; + hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!hugepage_kobj)) { + printk(KERN_ERR "hugepage: failed kobject create\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } +#endif + + err = khugepaged_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + if (err) { + khugepaged_slab_free(); + goto out; + } + + /* + * By default disable transparent hugepages on smaller systems, + * where the extra memory used could hurt more than TLB overhead + * is likely to save. The admin can still enable it through /sys. 
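Since hugepage_init() above adds the "transparent_hugepage" kobject to mm_kobj and attaches the "khugepaged" attribute group, these knobs end up under /sys/kernel/mm/transparent_hugepage/ on a mainline kernel. A minimal userspace sketch of flipping the mode and tightening one khugepaged tunable follows (root required; the write_sysfs() helper is local to this example, and the writes fail harmlessly where THP is not built in).

#include <stdio.h>

/* Local helper for this sketch: write one string to a sysfs file. */
static int write_sysfs(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	if (fputs(val, f) == EOF)
		perror(path);
	return fclose(f);
}

int main(void)
{
	char mode[64] = "";
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/enabled", "r");

	/* Prints something like "[always] madvise never". */
	if (f) {
		if (fgets(mode, sizeof(mode), f))
			printf("current mode: %s", mode);
		fclose(f);
	}

	/* Restrict hugepages to madvise(MADV_HUGEPAGE) regions ... */
	write_sysfs("/sys/kernel/mm/transparent_hugepage/enabled", "madvise");
	/* ... and stop khugepaged from collapsing over unmapped ptes. */
	write_sysfs("/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none", "0");
	return 0;
}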
+ */ + if (totalram_pages < (512 << (20 - PAGE_SHIFT))) + transparent_hugepage_flags = 0; + + start_khugepaged(); + + set_recommended_min_free_kbytes(); + +out: + return err; +} +module_init(hugepage_init) + +static int __init setup_transparent_hugepage(char *str) +{ + int ret = 0; + if (!str) + goto out; + if (!strcmp(str, "always")) { + set_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "madvise")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "never")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } +out: + if (!ret) + printk(KERN_WARNING + "transparent_hugepage= cannot parse, ignored\n"); + return ret; +} +__setup("transparent_hugepage=", setup_transparent_hugepage); + +static void prepare_pmd_huge_pte(pgtable_t pgtable, + struct mm_struct *mm) +{ + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(&pgtable->lru); + else + list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); + mm->pmd_huge_pte = pgtable; +} + +static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pmd = pmd_mkwrite(pmd); + return pmd; +} + +static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + struct page *page) +{ + int ret = 0; + pgtable_t pgtable; + + VM_BUG_ON(!PageCompound(page)); + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) { + mem_cgroup_uncharge_page(page); + put_page(page); + return VM_FAULT_OOM; + } + + clear_huge_page(page, haddr, HPAGE_PMD_NR); + __SetPageUptodate(page); + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_none(*pmd))) { + spin_unlock(&mm->page_table_lock); + mem_cgroup_uncharge_page(page); + put_page(page); + pte_free(mm, pgtable); + } else { + pmd_t entry; + entry = mk_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + /* + * The spinlocking to take the lru_lock inside + * page_add_new_anon_rmap() acts as a full memory + * barrier to be sure clear_huge_page writes become + * visible after the set_pmd_at() write. + */ + page_add_new_anon_rmap(page, vma, haddr); + set_pmd_at(mm, haddr, pmd, entry); + prepare_pmd_huge_pte(pgtable, mm); + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + spin_unlock(&mm->page_table_lock); + } + + return ret; +} + +static inline gfp_t alloc_hugepage_gfpmask(int defrag) +{ + return GFP_TRANSHUGE & ~(defrag ? 
0 : __GFP_WAIT); +} + +static inline struct page *alloc_hugepage_vma(int defrag, + struct vm_area_struct *vma, + unsigned long haddr) +{ + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), + HPAGE_PMD_ORDER, vma, haddr); +} + +#ifndef CONFIG_NUMA +static inline struct page *alloc_hugepage(int defrag) +{ + return alloc_pages(alloc_hugepage_gfpmask(defrag), + HPAGE_PMD_ORDER); +} +#endif + +int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags) +{ + struct page *page; + unsigned long haddr = address & HPAGE_PMD_MASK; + pte_t *pte; + + if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); + if (unlikely(!page)) + goto out; + if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + put_page(page); + goto out; + } + + return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); + } +out: + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(__pte_alloc(mm, vma, pmd, address))) + return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); + return handle_pte_fault(mm, vma, address, pte, pmd, flags); +} + +int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma) +{ + struct page *src_page; + pmd_t pmd; + pgtable_t pgtable; + int ret; + + ret = -ENOMEM; + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; + + spin_lock(&dst_mm->page_table_lock); + spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pmd = *src_pmd; + if (unlikely(!pmd_trans_huge(pmd))) { + pte_free(dst_mm, pgtable); + goto out_unlock; + } + if (unlikely(pmd_trans_splitting(pmd))) { + /* split huge page running from under us */ + spin_unlock(&src_mm->page_table_lock); + spin_unlock(&dst_mm->page_table_lock); + pte_free(dst_mm, pgtable); + + wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ + goto out; + } + src_page = pmd_page(pmd); + VM_BUG_ON(!PageHead(src_page)); + get_page(src_page); + page_dup_rmap(src_page); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + + pmdp_set_wrprotect(src_mm, addr, src_pmd); + pmd = pmd_mkold(pmd_wrprotect(pmd)); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + prepare_pmd_huge_pte(pgtable, dst_mm); + + ret = 0; +out_unlock: + spin_unlock(&src_mm->page_table_lock); + spin_unlock(&dst_mm->page_table_lock); +out: + return ret; +} + +/* no "address" argument so destroys page coloring of some arch */ +pgtable_t get_pmd_huge_pte(struct mm_struct *mm) +{ + pgtable_t pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); 
+ } + return pgtable; +} + +static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + struct page *page, + unsigned long haddr) +{ + pgtable_t pgtable; + pmd_t _pmd; + int ret = 0, i; + struct page **pages; + + pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, + GFP_KERNEL); + if (unlikely(!pages)) { + ret |= VM_FAULT_OOM; + goto out; + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); + if (unlikely(!pages[i] || + mem_cgroup_newpage_charge(pages[i], mm, + GFP_KERNEL))) { + if (pages[i]) + put_page(pages[i]); + mem_cgroup_uncharge_start(); + while (--i >= 0) { + mem_cgroup_uncharge_page(pages[i]); + put_page(pages[i]); + } + mem_cgroup_uncharge_end(); + kfree(pages); + ret |= VM_FAULT_OOM; + goto out; + } + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + copy_user_highpage(pages[i], page + i, + haddr + PAGE_SHIFT*i, vma); + __SetPageUptodate(pages[i]); + cond_resched(); + } + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_free_pages; + VM_BUG_ON(!PageHead(page)); + + pmdp_clear_flush_notify(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = mk_pte(pages[i], vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_add_new_anon_rmap(pages[i], vma, haddr); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + kfree(pages); + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + page_remove_rmap(page); + spin_unlock(&mm->page_table_lock); + + ret |= VM_FAULT_WRITE; + put_page(page); + +out: + return ret; + +out_free_pages: + spin_unlock(&mm->page_table_lock); + mem_cgroup_uncharge_start(); + for (i = 0; i < HPAGE_PMD_NR; i++) { + mem_cgroup_uncharge_page(pages[i]); + put_page(pages[i]); + } + mem_cgroup_uncharge_end(); + kfree(pages); + goto out; +} + +int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd) +{ + int ret = 0; + struct page *page, *new_page; + unsigned long haddr; + + VM_BUG_ON(!vma->anon_vma); + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_unlock; + + page = pmd_page(orig_pmd); + VM_BUG_ON(!PageCompound(page) || !PageHead(page)); + haddr = address & HPAGE_PMD_MASK; + if (page_mapcount(page) == 1) { + pmd_t entry; + entry = pmd_mkyoung(orig_pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) + update_mmu_cache(vma, address, entry); + ret |= VM_FAULT_WRITE; + goto out_unlock; + } + get_page(page); + spin_unlock(&mm->page_table_lock); + + if (transparent_hugepage_enabled(vma) && + !transparent_hugepage_debug_cow()) + new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); + else + new_page = NULL; + + if (unlikely(!new_page)) { + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + put_page(page); + goto out; + } + + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + put_page(new_page); + put_page(page); + ret |= VM_FAULT_OOM; + goto out; + } + + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + 
__SetPageUptodate(new_page); + + spin_lock(&mm->page_table_lock); + put_page(page); + if (unlikely(!pmd_same(*pmd, orig_pmd))) { + mem_cgroup_uncharge_page(new_page); + put_page(new_page); + } else { + pmd_t entry; + VM_BUG_ON(!PageHead(page)); + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + pmdp_clear_flush_notify(vma, haddr, pmd); + page_add_new_anon_rmap(new_page, vma, haddr); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache(vma, address, entry); + page_remove_rmap(page); + put_page(page); + ret |= VM_FAULT_WRITE; + } +out_unlock: + spin_unlock(&mm->page_table_lock); +out: + return ret; +} + +struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct page *page = NULL; + + assert_spin_locked(&mm->page_table_lock); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + goto out; + + page = pmd_page(*pmd); + VM_BUG_ON(!PageHead(page)); + if (flags & FOLL_TOUCH) { + pmd_t _pmd; + /* + * We should set the dirty bit only for FOLL_WRITE but + * for now the dirty bit in the pmd is meaningless. + * And if the dirty bit will become meaningful and + * we'll only set it with FOLL_WRITE, an atomic + * set_bit will be required on the pmd to set the + * young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + } + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + VM_BUG_ON(!PageCompound(page)); + if (flags & FOLL_GET) + get_page(page); + +out: + return page; +} + +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd) +{ + int ret = 0; + + spin_lock(&tlb->mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&tlb->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, + pmd); + } else { + struct page *page; + pgtable_t pgtable; + pgtable = get_pmd_huge_pte(tlb->mm); + page = pmd_page(*pmd); + pmd_clear(pmd); + page_remove_rmap(page); + VM_BUG_ON(page_mapcount(page) < 0); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON(!PageHead(page)); + spin_unlock(&tlb->mm->page_table_lock); + tlb_remove_page(tlb, page); + pte_free(tlb->mm, pgtable); + ret = 1; + } + } else + spin_unlock(&tlb->mm->page_table_lock); + + return ret; +} + +int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + int ret = 0; + + spin_lock(&vma->vm_mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + ret = !pmd_trans_splitting(*pmd); + spin_unlock(&vma->vm_mm->page_table_lock); + if (unlikely(!ret)) + wait_split_huge_page(vma->anon_vma, pmd); + else { + /* + * All logical pages in the range are present + * if backed by a huge page. 
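The effect of mincore_huge_pmd() above is visible from userspace: once a pmd-mapped hugepage backs a range, a single fault makes every page in that range report as resident. The sketch below assumes the 2MB pmd size used on x86 and defines MADV_HUGEPAGE locally for headers that predate this series; whether a hugepage is really used still depends on the chosen enabled/defrag mode and on the allocation succeeding.

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed pmd-mapped hugepage size (x86) */

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE	14			/* value introduced by this series */
#endif

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	size_t len = 2 * HPAGE_SIZE;		/* over-allocate so we can align */
	size_t npages, i, resident = 0;
	unsigned char *vec;
	char *raw, *buf;

	raw = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Align to a 2MB boundary so a huge pmd can map the range. */
	buf = (char *)(((unsigned long)raw + HPAGE_SIZE - 1) & ~(HPAGE_SIZE - 1));

	madvise(buf, HPAGE_SIZE, MADV_HUGEPAGE);	/* ignored by older kernels */
	buf[0] = 1;					/* fault in the range */

	npages = HPAGE_SIZE / page_size;
	vec = calloc(npages, 1);
	if (!vec || mincore(buf, HPAGE_SIZE, vec)) {
		perror("mincore");
		return 1;
	}
	for (i = 0; i < npages; i++)
		resident += vec[i] & 1;
	/* 512/512 when a hugepage was faulted in, 1/512 otherwise. */
	printf("resident: %zu/%zu pages\n", resident, npages);
	return 0;
}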
+ */ + memset(vec, 1, (end - addr) >> PAGE_SHIFT); + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + pmd_t entry; + + entry = pmdp_get_and_clear(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + set_pmd_at(mm, addr, pmd, entry); + spin_unlock(&vma->vm_mm->page_table_lock); + flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + ret = 1; + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + +pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, *ret = NULL; + + if (address & ~HPAGE_PMD_MASK) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + if (pmd_page(*pmd) != page) + goto out; + /* + * split_vma() may create temporary aliased mappings. There is + * no risk as long as all huge pmd are found and have their + * splitting bit set before __split_huge_page_refcount + * runs. Finding the same huge pmd more than once during the + * same rmap walk is not a problem. + */ + if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)) + goto out; + if (pmd_trans_huge(*pmd)) { + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && + !pmd_trans_splitting(*pmd)); + ret = pmd; + } +out: + return ret; +} + +static int __split_huge_page_splitting(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd; + int ret = 0; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + if (pmd) { + /* + * We can't temporarily set the pmd to null in order + * to split it, the pmd must remain marked huge at all + * times or the VM won't take the pmd_trans_huge paths + * and it won't wait on the anon_vma->root->lock to + * serialize against split_huge_page*. 
+ */ + pmdp_splitting_flush_notify(vma, address, pmd); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +static void __split_huge_page_refcount(struct page *page) +{ + int i; + unsigned long head_index = page->index; + struct zone *zone = page_zone(page); + int zonestat; + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + compound_lock(page); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + + /* tail_page->_count cannot change */ + atomic_sub(atomic_read(&page_tail->_count), &page->_count); + BUG_ON(page_count(page) <= 0); + atomic_add(page_mapcount(page) + 1, &page_tail->_count); + BUG_ON(atomic_read(&page_tail->_count) <= 0); + + /* after clearing PageTail the gup refcount can be released */ + smp_mb(); + + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (page->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate))); + page_tail->flags |= (1L << PG_dirty); + + /* + * 1) clear PageTail before overwriting first_page + * 2) clear PageTail before clearing PageHead for VM_BUG_ON + */ + smp_wmb(); + + /* + * __split_huge_page_splitting() already set the + * splitting bit in all pmd that could map this + * hugepage, that will ensure no CPU can alter the + * mapcount on the head page. The mapcount is only + * accounted in the head page and it has to be + * transferred to all tail pages in the below code. So + * for this code to be safe, the split the mapcount + * can't change. But that doesn't mean userland can't + * keep changing and reading the page contents while + * we transfer the mapcount, so the pmd splitting + * status is achieved setting a reserved bit in the + * pmd, not by clearing the present bit. + */ + BUG_ON(page_mapcount(page_tail)); + page_tail->_mapcount = page->_mapcount; + + BUG_ON(page_tail->mapping); + page_tail->mapping = page->mapping; + + page_tail->index = ++head_index; + + BUG_ON(!PageAnon(page_tail)); + BUG_ON(!PageUptodate(page_tail)); + BUG_ON(!PageDirty(page_tail)); + BUG_ON(!PageSwapBacked(page_tail)); + + lru_add_page_tail(zone, page, page_tail); + } + + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + + /* + * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, + * so adjust those appropriately if this page is on the LRU. + */ + if (PageLRU(page)) { + zonestat = NR_LRU_BASE + page_lru(page); + __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); + } + + ClearPageCompound(page); + compound_unlock(page); + spin_unlock_irq(&zone->lru_lock); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + BUG_ON(page_count(page_tail) <= 0); + /* + * Tail pages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(page_tail); + } + + /* + * Only the head page (now become a regular page) is required + * to be pinned by the caller. 
+ */ + BUG_ON(page_count(page) <= 0); +} + +static int __split_huge_page_map(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd, _pmd; + int ret = 0, i; + pgtable_t pgtable; + unsigned long haddr; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + if (pmd) { + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, haddr = address; i < HPAGE_PMD_NR; + i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + BUG_ON(PageCompound(page+i)); + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (!pmd_write(*pmd)) + entry = pte_wrprotect(entry); + else + BUG_ON(page_mapcount(page) != 1); + if (!pmd_young(*pmd)) + entry = pte_mkold(entry); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + /* + * Up to this point the pmd is present and huge and + * userland has the whole access to the hugepage + * during the split (which happens in place). If we + * overwrite the pmd with the not-huge version + * pointing to the pte here (which of course we could + * if all CPUs were bug free), userland could trigger + * a small page size TLB miss on the small sized TLB + * while the hugepage TLB entry is still established + * in the huge TLB. Some CPU doesn't like that. See + * http://support.amd.com/us/Processor_TechDocs/41322.pdf, + * Erratum 383 on page 93. Intel should be safe but is + * also warns that it's only safe if the permission + * and cache attributes of the two entries loaded in + * the two TLB is identical (which should be the case + * here). But it is generally safer to never allow + * small and huge TLB entries for the same virtual + * address to be loaded simultaneously. So instead of + * doing "pmd_populate(); flush_tlb_range();" we first + * mark the current pmd notpresent (atomically because + * here the pmd_trans_huge and pmd_trans_splitting + * must remain set at all times on the pmd until the + * split is complete for this pmd), then we flush the + * SMP TLB and finally we write the non-huge version + * of the pmd entry with pmd_populate. + */ + set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmd_populate(mm, pmd, pgtable); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +/* must be called with anon_vma->root->lock hold */ +static void __split_huge_page(struct page *page, + struct anon_vma *anon_vma) +{ + int mapcount, mapcount2; + struct anon_vma_chain *avc; + + BUG_ON(!PageHead(page)); + BUG_ON(PageTail(page)); + + mapcount = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount += __split_huge_page_splitting(page, vma, addr); + } + /* + * It is critical that new vmas are added to the tail of the + * anon_vma list. 
This guarantes that if copy_huge_pmd() runs + * and establishes a child pmd before + * __split_huge_page_splitting() freezes the parent pmd (so if + * we fail to prevent copy_huge_pmd() from running until the + * whole __split_huge_page() is complete), we will still see + * the newly established pmd of the child later during the + * walk, to be able to set it as pmd_trans_splitting too. + */ + if (mapcount != page_mapcount(page)) + printk(KERN_ERR "mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); + BUG_ON(mapcount != page_mapcount(page)); + + __split_huge_page_refcount(page); + + mapcount2 = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount2 += __split_huge_page_map(page, vma, addr); + } + if (mapcount != mapcount2) + printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); + BUG_ON(mapcount != mapcount2); +} + +int split_huge_page(struct page *page) +{ + struct anon_vma *anon_vma; + int ret = 1; + + BUG_ON(!PageAnon(page)); + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + goto out; + ret = 0; + if (!PageCompound(page)) + goto out_unlock; + + BUG_ON(!PageSwapBacked(page)); + __split_huge_page(page, anon_vma); + + BUG_ON(PageCompound(page)); +out_unlock: + page_unlock_anon_vma(anon_vma); +out: + return ret; +} + +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) +{ + switch (advice) { + case MADV_HUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_HUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (unlikely(khugepaged_enter_vma_merge(vma))) + return -ENOMEM; + break; + case MADV_NOHUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_NOHUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. 
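MADV_HUGEPAGE and MADV_NOHUGEPAGE give applications per-mapping control over the policy implemented above. The sketch below (the advice values are spelled out for headers that predate this series) shows the expected outcomes with hugepage_madvise() as written here: a private anonymous mapping can opt in and back out, while a shared mapping is rejected with EINVAL by the VM_SHARED/VM_MAYSHARE check.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE	14	/* values introduced by this series */
#endif
#ifndef MADV_NOHUGEPAGE
#define MADV_NOHUGEPAGE	15
#endif

static void try_advice(void *p, size_t len, int advice, const char *what)
{
	if (madvise(p, len, advice))
		printf("%s: %s\n", what, strerror(errno));
	else
		printf("%s: ok\n", what);
}

int main(void)
{
	size_t len = 4UL * 1024 * 1024;
	void *priv = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *shared = mmap(NULL, len, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	if (priv == MAP_FAILED || shared == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Private anonymous memory is eligible for huge faults and khugepaged. */
	try_advice(priv, len, MADV_HUGEPAGE, "MADV_HUGEPAGE, private");
	/* Opting back out sets VM_NOHUGEPAGE, so khugepaged skips the vma. */
	try_advice(priv, len, MADV_NOHUGEPAGE, "MADV_NOHUGEPAGE, private");
	/* Rejected by the VM_SHARED/VM_MAYSHARE test in hugepage_madvise(). */
	try_advice(shared, len, MADV_HUGEPAGE, "MADV_HUGEPAGE, shared");
	return 0;
}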
+ */ + break; + } + + return 0; +} + +static int __init khugepaged_slab_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + return 0; +} + +static void __init khugepaged_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + mm_slot_cache = NULL; +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +#if 0 +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); + mm_slots_hash = NULL; +} +#endif + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + hlist_for_each_entry(mm_slot, node, bucket, hash) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + hlist_add_head(&mm_slot->hash, bucket); +} + +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON(khugepaged_test_exit(mm)); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. + */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} + +int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. 
+ */ + return 0; + if (vma->vm_file || vma->vm_ops) + /* khugepaged not yet working on file or special mappings */ + return 0; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart < hend) + return khugepaged_enter(vma); + return 0; +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + + if (free) { + spin_unlock(&khugepaged_mm_lock); + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + spin_unlock(&khugepaged_mm_lock); + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } else + spin_unlock(&khugepaged_mm_lock); +} + +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval)) + release_pte_page(pte_page(pteval)); + } +} + +static void release_all_pte_pages(pte_t *pte) +{ + release_pte_pages(pte, pte + HPAGE_PMD_NR); +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page; + pte_t *_pte; + int referenced = 0, isolated = 0, none = 0; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else { + release_pte_pages(pte, _pte); + goto out; + } + } + if (!pte_present(pteval) || !pte_write(pteval)) { + release_pte_pages(pte, _pte); + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) { + release_pte_pages(pte, _pte); + goto out; + } + VM_BUG_ON(PageCompound(page)); + BUG_ON(!PageAnon(page)); + VM_BUG_ON(!PageSwapBacked(page)); + + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * Isolate the page to avoid collapsing an hugepage + * currently in use by the VM. 
+ */ + if (isolate_lru_page(page)) { + unlock_page(page); + release_pte_pages(pte, _pte); + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageLRU(page)); + + /* If there is no mapped pte young don't collapse the page */ + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = 1; + } + if (unlikely(!referenced)) + release_all_pte_pages(pte); + else + isolated = 1; +out: + return isolated; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval)) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON(page_mapcount(src_page) != 1); + VM_BUG_ON(page_count(src_page) != 2); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#ifndef CONFIG_NUMA + VM_BUG_ON(!*hpage); + new_page = *hpage; +#else + VM_BUG_ON(*hpage); + /* + * Allocate the page while the vma is still valid and under + * the mmap_sem read mode so there is no memory allocation + * later when we take the mmap_sem in write mode. This is more + * friendly behavior (OTOH it may actually hide bugs) to + * filesystems in userland with daemons allocating memory in + * the userland I/O paths. Allocating memory with the + * mmap_sem in read mode is good idea also to allow greater + * scalability. + */ + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + up_read(&mm->mmap_sem); + *hpage = ERR_PTR(-ENOMEM); + return; + } +#endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + put_page(new_page); + return; + } + + /* after allocating the hugepage upgrade to mmap_sem write mode */ + up_read(&mm->mmap_sem); + + /* + * Prevent all access to pagetables with the exception of + * gup_fast later hanlded by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. 
+ */ + down_write(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + vma = find_vma(mm, address); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + goto out; + + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + goto out; + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto out; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + /* pmd can't go away or become huge under us */ + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + anon_vma_lock(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + ptl = pte_lockptr(mm, pmd); + + spin_lock(&mm->page_table_lock); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. + */ + _pmd = pmdp_clear_flush_notify(vma, address, pmd); + spin_unlock(&mm->page_table_lock); + + spin_lock(ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(ptl); + pte_unmap(pte); + + if (unlikely(!isolated)) { + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + set_pmd_at(mm, address, pmd, _pmd); + spin_unlock(&mm->page_table_lock); + anon_vma_unlock(vma->anon_vma); + mem_cgroup_uncharge_page(new_page); + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + VM_BUG_ON(page_count(pgtable) != 1); + VM_BUG_ON(page_mapcount(pgtable) != 0); + + _pmd = mk_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + _pmd = pmd_mkhuge(_pmd); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. 
+ */ + smp_wmb(); + + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache(vma, address, entry); + prepare_pmd_huge_pte(pgtable, mm); + mm->nr_ptes--; + spin_unlock(&mm->page_table_lock); + +#ifndef CONFIG_NUMA + *hpage = NULL; +#endif + khugepaged_pages_collapsed++; +out_up_write: + up_write(&mm->mmap_sem); + return; + +out: +#ifdef CONFIG_NUMA + put_page(new_page); +#endif + goto out_up_write; +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, referenced = 0, none = 0; + struct page *page; + unsigned long _address; + spinlock_t *ptl; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else + goto out_unmap; + } + if (!pte_present(pteval) || !pte_write(pteval)) + goto out_unmap; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + goto out_unmap; + VM_BUG_ON(PageCompound(page)); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + goto out_unmap; + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) + goto out_unmap; + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = 1; + } + if (referenced) + ret = 1; +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) + /* collapse_huge_page will return with the mmap_sem released */ + collapse_huge_page(mm, address, hpage, vma); +out: + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. 
+ * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + + if ((!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) { + progress++; + continue; + } + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { + khugepaged_scan.address = vma->vm_end; + progress++; + continue; + } + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) { + progress++; + continue; + } + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + if (khugepaged_scan.address > hend) { + khugepaged_scan.address = hend + HPAGE_PMD_SIZE; + progress++; + continue; + } + BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. 
+ */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + !khugepaged_enabled(); +} + +static void khugepaged_do_scan(struct page **hpage) +{ + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + cond_resched(); + +#ifndef CONFIG_NUMA + if (!*hpage) { + *hpage = alloc_hugepage(khugepaged_defrag()); + if (unlikely(!*hpage)) + break; + } +#else + if (IS_ERR(*hpage)) + break; +#endif + + if (unlikely(kthread_should_stop() || freezing(current))) + break; + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } +} + +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +#ifndef CONFIG_NUMA +static struct page *khugepaged_alloc_hugepage(void) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) + khugepaged_alloc_sleep(); + } while (unlikely(!hpage) && + likely(khugepaged_enabled())); + return hpage; +} +#endif + +static void khugepaged_loop(void) +{ + struct page *hpage; + +#ifdef CONFIG_NUMA + hpage = NULL; +#endif + while (likely(khugepaged_enabled())) { +#ifndef CONFIG_NUMA + hpage = khugepaged_alloc_hugepage(); + if (unlikely(!hpage)) + break; +#else + if (IS_ERR(hpage)) { + khugepaged_alloc_sleep(); + hpage = NULL; + } +#endif + + khugepaged_do_scan(&hpage); +#ifndef CONFIG_NUMA + if (hpage) + put_page(hpage); +#endif + try_to_freeze(); + if (unlikely(kthread_should_stop())) + break; + if (khugepaged_has_work()) { + DEFINE_WAIT(wait); + if (!khugepaged_scan_sleep_millisecs) + continue; + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_scan_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } else if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, + khugepaged_wait_event()); + } +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_freezable(); + set_user_nice(current, 19); + + /* serialize with start_khugepaged() */ + mutex_lock(&khugepaged_mutex); + + for (;;) { + mutex_unlock(&khugepaged_mutex); + BUG_ON(khugepaged_thread != current); + khugepaged_loop(); + BUG_ON(khugepaged_thread != current); + + mutex_lock(&khugepaged_mutex); + if (!khugepaged_enabled()) + break; + if (unlikely(kthread_should_stop())) + break; + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + + khugepaged_thread = NULL; + mutex_unlock(&khugepaged_mutex); + + return 0; +} + +void 
__split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct page *page; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(&mm->page_table_lock); + return; + } + page = pmd_page(*pmd); + VM_BUG_ON(!page_count(page)); + get_page(page); + spin_unlock(&mm->page_table_lock); + + split_huge_page(page); + + put_page(page); + BUG_ON(pmd_trans_huge(*pmd)); +} + +static void split_huge_page_address(struct mm_struct *mm, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return; + /* + * Caller holds the mmap_sem write mode, so a huge pmd cannot + * materialize from under us. + */ + split_huge_page_pmd(mm, pmd); +} + +void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + /* + * If the new start address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (start & ~HPAGE_PMD_MASK && + (start & HPAGE_PMD_MASK) >= vma->vm_start && + (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, start); + + /* + * If the new end address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (end & ~HPAGE_PMD_MASK && + (end & HPAGE_PMD_MASK) >= vma->vm_start && + (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, end); + + /* + * If we're also updating the vma->vm_next->vm_start, if the new + * vm_next->vm_start isn't page aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. 
+ */ + if (adjust_next > 0) { + struct vm_area_struct *next = vma->vm_next; + unsigned long nstart = next->vm_start; + nstart += adjust_next << PAGE_SHIFT; + if (nstart & ~HPAGE_PMD_MASK && + (nstart & HPAGE_PMD_MASK) >= next->vm_start && + (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) + split_huge_page_address(next->vm_mm, nstart); + } +} diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8585524..bb0b7c1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma) return 0; } -static void clear_gigantic_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - struct page *p = page; - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { - cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); - } -} -static void clear_huge_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - - if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, sz); - return; - } - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++) { - cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); - } -} - -static void copy_user_gigantic_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < pages_per_huge_page(h); ) { - cond_resched(); - copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -static void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_user_gigantic_page(dst, src, addr, vma); - return; - } - - might_sleep(); - for (i = 0; i < pages_per_huge_page(h); i++) { - cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); - } -} - static void copy_gigantic_page(struct page *dst, struct page *src) { int i; @@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } + static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) @@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, err = strict_strtoul(buf, 10, &count); if (err) - return 0; + goto out; h = kobj_to_hstate(kobj, &nid); + if (h->order >= MAX_ORDER) { + err = -EINVAL; + goto out; + } + if (nid == NUMA_NO_NODE) { /* * global hstate attribute @@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, NODEMASK_FREE(nodes_allowed); return len; +out: + NODEMASK_FREE(nodes_allowed); + return err; } static ssize_t nr_hugepages_show(struct kobject *kobj, @@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); } + static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); + if (h->order >= MAX_ORDER) + return -EINVAL; + err = 
strict_strtoul(buf, 10, &input); if (err) - return 0; + return err; spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = input; @@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; if (!write) tmp = h->max_huge_pages; + if (write && h->order >= MAX_ORDER) + return -EINVAL; + table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { NODEMASK_ALLOC(nodemask_t, nodes_allowed, @@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (nodes_allowed != &node_states[N_HIGH_MEMORY]) NODEMASK_FREE(nodes_allowed); } - - return 0; +out: + return ret; } int hugetlb_sysctl_handler(struct ctl_table *table, int write, @@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; if (!write) tmp = h->nr_overcommit_huge_pages; + if (write && h->order >= MAX_ORDER) + return -EINVAL; + table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; spin_unlock(&hugetlb_lock); } - - return 0; +out: + return ret; } #endif /* CONFIG_SYSCTL */ @@ -2454,7 +2414,8 @@ retry_avoidcopy: return VM_FAULT_OOM; } - copy_user_huge_page(new_page, old_page, address, vma); + copy_user_huge_page(new_page, old_page, address, vma, + pages_per_huge_page(h)); __SetPageUptodate(new_page); /* @@ -2558,7 +2519,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address, huge_page_size(h)); + clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_MAYSHARE) { diff --git a/mm/internal.h b/mm/internal.h index dedb0af..4c98630 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -39,6 +39,15 @@ static inline void __put_page(struct page *page) extern unsigned long highest_memmap_pfn; +#ifdef CONFIG_SMP +extern int putback_active_lru_page(struct zone *zone, struct page *page); +#else +static inline int putback_active_lru_page(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + /* * in mm/vmscan.c: */ @@ -134,6 +143,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern unsigned long vma_address(struct page *page, + struct vm_area_struct *vma); +#endif #else /* !CONFIG_MMU */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { @@ -243,7 +256,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas); + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking); #define ZONE_RECLAIM_NOSCAN -2 #define ZONE_RECLAIM_FULL -1 @@ -34,6 +34,7 @@ #include <linux/swap.h> #include <linux/ksm.h> #include <linux/hash.h> +#include <linux/freezer.h> #include <asm/tlbflush.h> #include "internal.h" @@ -411,6 +412,20 @@ out: up_read(&mm->mmap_sem); } +static struct page *page_trans_compound_anon(struct page *page) +{ + if (PageTransCompound(page)) { + struct page 
*head = compound_trans_head(page); + /* + * head may actually be splitted and freed from under + * us but it's ok here. + */ + if (PageAnon(head)) + return head; + } + return NULL; +} + static struct page *get_mergeable_page(struct rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; @@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) page = follow_page(vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto out; - if (PageAnon(page)) { + if (PageAnon(page) || page_trans_compound_anon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { @@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (addr == -EFAULT) goto out; + BUG_ON(PageTransCompound(page)); ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) goto out; @@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, goto out; pmd = pmd_offset(pud, addr); + BUG_ON(pmd_trans_huge(*pmd)); if (!pmd_present(*pmd)) goto out; @@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); page_remove_rmap(page); + if (!page_mapped(page)) + try_to_free_swap(page); put_page(page); pte_unmap_unlock(ptep, ptl); @@ -808,6 +827,33 @@ out: return err; } +static int page_trans_compound_anon_split(struct page *page) +{ + int ret = 0; + struct page *transhuge_head = page_trans_compound_anon(page); + if (transhuge_head) { + /* Get the reference on the head to split it. */ + if (get_page_unless_zero(transhuge_head)) { + /* + * Recheck we got the reference while the head + * was still anonymous. + */ + if (PageAnon(transhuge_head)) + ret = split_huge_page(transhuge_head); + else + /* + * Retry later if split_huge_page run + * from under us. + */ + ret = 1; + put_page(transhuge_head); + } else + /* Retry later if split_huge_page run from under us. */ + ret = 1; + } + return ret; +} + /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page @@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (!(vma->vm_flags & VM_MERGEABLE)) goto out; + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); if (!PageAnon(page)) goto out; @@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) slot = ksm_scan.mm_slot; if (slot == &ksm_mm_head) { + /* + * A number of pages can hang around indefinitely on per-cpu + * pagevecs, raised page count preventing write_protect_page + * from merging them. Though it doesn't really matter much, + * it is puzzling to see some stuck in pages_volatile until + * other activity jostles them out, and they also prevented + * LTP's KSM test from succeeding deterministically; so drain + * them here (here rather than on entry to ksm_do_scan(), + * so we don't IPI too often when pages_to_scan is set low). 
+ */ + lru_add_drain_all(); + root_unstable_tree = RB_ROOT; spin_lock(&ksm_mmlist_lock); @@ -1277,7 +1338,13 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { + if (IS_ERR_OR_NULL(*page)) { + ksm_scan.address += PAGE_SIZE; + cond_resched(); + continue; + } + if (PageAnon(*page) || + page_trans_compound_anon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, @@ -1291,8 +1358,7 @@ next_mm: up_read(&mm->mmap_sem); return rmap_item; } - if (!IS_ERR_OR_NULL(*page)) - put_page(*page); + put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); } @@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages) struct rmap_item *rmap_item; struct page *uninitialized_var(page); - while (scan_npages--) { + while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) @@ -1370,6 +1436,7 @@ static int ksmd_should_run(void) static int ksm_scan_thread(void *nothing) { + set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { @@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); + try_to_freeze(); + if (ksmd_should_run()) { schedule_timeout_interruptible( msecs_to_jiffies(ksm_thread_sleep_millisecs)); } else { - wait_event_interruptible(ksm_thread_wait, + wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } diff --git a/mm/madvise.c b/mm/madvise.c index 319528b..2221491 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma, if (error) goto out; break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior) case MADV_MERGEABLE: case MADV_UNMERGEABLE: #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: +#endif return 1; default: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00bb8a6..8ab8410 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -292,7 +292,6 @@ static struct move_charge_struct { unsigned long moved_charge; unsigned long moved_swap; struct task_struct *moving_task; /* a task moving charges */ - struct mm_struct *mm; wait_queue_head_t waitq; /* a waitq for other context */ } mc = { .lock = __SPIN_LOCK_UNLOCKED(mc.lock), @@ -821,7 +820,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) return; VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); - return; } void mem_cgroup_del_lru(struct page *page) @@ -1087,7 +1085,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* we don't affect global LRU but rotate in our LRU */ @@ -1312,8 +1310,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) u64 limit; u64 memsw; - limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + - total_swap_pages; + limit = res_counter_read_u64(&memcg->res, RES_LIMIT); + limit += total_swap_pages << PAGE_SHIFT; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); /* * If memsw is finite and limits the amount of swap space available @@ -1600,11 +1599,13 @@ bool 
mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * possibility of race condition. If there is, we take a lock. */ -static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); bool need_unlock = false; + unsigned long uninitialized_var(flags); if (unlikely(!pc)) return; @@ -1616,37 +1617,34 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) /* pc->mem_cgroup is unstable ? */ if (unlikely(mem_cgroup_stealed(mem))) { /* take a lock against to access pc->mem_cgroup */ - lock_page_cgroup(pc); + move_lock_page_cgroup(pc, &flags); need_unlock = true; mem = pc->mem_cgroup; if (!mem || !PageCgroupUsed(pc)) goto out; } - this_cpu_add(mem->stat->count[idx], val); - switch (idx) { - case MEM_CGROUP_STAT_FILE_MAPPED: + case MEMCG_NR_FILE_MAPPED: if (val > 0) SetPageCgroupFileMapped(pc); else if (!page_mapped(page)) ClearPageCgroupFileMapped(pc); + idx = MEM_CGROUP_STAT_FILE_MAPPED; break; default: BUG(); } + this_cpu_add(mem->stat->count[idx], val); + out: if (unlikely(need_unlock)) - unlock_page_cgroup(pc); + move_unlock_page_cgroup(pc, &flags); rcu_read_unlock(); return; } - -void mem_cgroup_update_file_mapped(struct page *page, int val) -{ - mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); -} +EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. @@ -1887,12 +1885,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, + struct mem_cgroup **memcg, bool oom, + int page_size) { int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL; int ret; - int csize = CHARGE_SIZE; + int csize = max(CHARGE_SIZE, (unsigned long) page_size); /* * Unlike gloval-vm's OOM-kill, we're not in memory shortage @@ -1917,7 +1917,7 @@ again: VM_BUG_ON(css_is_removed(&mem->css)); if (mem_cgroup_is_root(mem)) goto done; - if (consume_stock(mem)) + if (page_size == PAGE_SIZE && consume_stock(mem)) goto done; css_get(&mem->css); } else { @@ -1940,7 +1940,7 @@ again: rcu_read_unlock(); goto done; } - if (consume_stock(mem)) { + if (page_size == PAGE_SIZE && consume_stock(mem)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -1981,7 +1981,7 @@ again: case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ - csize = PAGE_SIZE; + csize = page_size; css_put(&mem->css); mem = NULL; goto again; @@ -2002,8 +2002,8 @@ again: } } while (ret != CHARGE_OK); - if (csize > PAGE_SIZE) - refill_stock(mem, csize - PAGE_SIZE); + if (csize > page_size) + refill_stock(mem, csize - page_size); css_put(&mem->css); done: *memcg = mem; @@ -2031,9 +2031,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, } } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, + int page_size) { - __mem_cgroup_cancel_charge(mem, 1); + __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); } /* @@ -2087,22 +2088,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be * USED state. 
If already USED, uncharge and return. */ - -static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) +static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype) { - /* try_charge() can return NULL to *memcg, taking care of it. */ - if (!mem) - return; - - lock_page_cgroup(pc); - if (unlikely(PageCgroupUsed(pc))) { - unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem); - return; - } - pc->mem_cgroup = mem; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2127,6 +2116,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, } mem_cgroup_charge_statistics(mem, pc, true); +} + +static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) +{ + int i; + int count = page_size >> PAGE_SHIFT; + + /* try_charge() can return NULL to *memcg, taking care of it. */ + if (!mem) + return; + + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); + mem_cgroup_cancel_charge(mem, page_size); + return; + } + + /* + * we don't need page_cgroup_lock about tail pages, becase they are not + * accessed by any other context at this point. + */ + for (i = 0; i < count; i++) + ____mem_cgroup_commit_charge(mem, pc + i, ctype); unlock_page_cgroup(pc); /* @@ -2173,7 +2189,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, mem_cgroup_charge_statistics(from, pc, false); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from); + mem_cgroup_cancel_charge(from, PAGE_SIZE); /* caller should have done css_get */ pc->mem_cgroup = to; @@ -2195,9 +2211,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) { int ret = -EINVAL; + unsigned long flags; + lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { + move_lock_page_cgroup(pc, &flags); __mem_cgroup_move_account(pc, from, to, uncharge); + move_unlock_page_cgroup(pc, &flags); ret = 0; } unlock_page_cgroup(pc); @@ -2234,13 +2254,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, goto put; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, + PAGE_SIZE); if (ret || !parent) goto put_back; ret = mem_cgroup_move_account(pc, child, parent, true); if (ret) - mem_cgroup_cancel_charge(parent); + mem_cgroup_cancel_charge(parent, PAGE_SIZE); put_back: putback_lru_page(page); put: @@ -2261,6 +2282,12 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, struct mem_cgroup *mem = NULL; struct page_cgroup *pc; int ret; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) { + page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -2268,11 +2295,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, page_size); return 0; } @@ -2281,8 +2308,6 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - if 
(PageCompound(page)) - return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -2388,13 +2413,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); } static void @@ -2410,7 +2435,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype); + __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -2459,11 +2484,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, PAGE_SIZE); } static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, + int page_size) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; @@ -2490,6 +2516,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) goto direct_uncharge; + if (page_size != PAGE_SIZE) + goto direct_uncharge; + /* * In typical case, batch->memcg == mem. This means we can * merge a series of uncharges to an uncharge of res_counter. 
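
The __do_uncharge() changes just above only batch order-0 pages: any page_size larger than PAGE_SIZE takes the direct_uncharge path, so a huge page is never left half-accounted in a per-task batch. A minimal userspace sketch of that rule, using hypothetical counter/batch types rather than the kernel's memcg structures:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Hypothetical stand-ins for the res_counter and the per-task batch state. */
struct counter { unsigned long charged; };
struct batch   { struct counter *memcg; unsigned long bytes; };

static void direct_uncharge(struct counter *c, unsigned long size)
{
	c->charged -= size;
}

/* Mirrors the rule in the hunk above: only PAGE_SIZE uncharges against the
 * currently batched memcg are folded into the batch; everything else goes
 * straight to the counter. */
static void do_uncharge(struct batch *b, struct counter *c, unsigned long page_size)
{
	if (page_size != PAGE_SIZE || b->memcg != c) {
		direct_uncharge(c, page_size);	/* huge pages bypass the batch */
		return;
	}
	b->bytes += PAGE_SIZE;			/* folded into one later uncharge */
}

int main(void)
{
	struct counter memcg = { .charged = 10 * PAGE_SIZE + (PAGE_SIZE << 9) };
	struct batch   batch = { .memcg = &memcg, .bytes = 0 };

	do_uncharge(&batch, &memcg, PAGE_SIZE);		/* batched */
	do_uncharge(&batch, &memcg, PAGE_SIZE << 9);	/* 2MB huge page: direct */

	direct_uncharge(&memcg, batch.bytes);		/* flush the batch */
	printf("remaining charge: %lu bytes\n", memcg.charged);
	return 0;
}
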
@@ -2503,9 +2532,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) batch->memsw_bytes += PAGE_SIZE; return; direct_uncharge: - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, page_size); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, page_size); if (unlikely(batch->memcg != mem)) memcg_oom_recover(mem); return; @@ -2517,8 +2546,11 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { + int i; + int count; struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; if (mem_cgroup_disabled()) return NULL; @@ -2526,6 +2558,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; + if (PageTransHuge(page)) { + page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } + + count = page_size >> PAGE_SHIFT; /* * Check if our page_cgroup is valid */ @@ -2558,7 +2596,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, pc, false); + for (i = 0; i < count; i++) + mem_cgroup_charge_statistics(mem, pc + i, false); ClearPageCgroupUsed(pc); /* @@ -2579,7 +2618,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); + __do_uncharge(mem, ctype, page_size); return mem; @@ -2774,6 +2813,7 @@ int mem_cgroup_prepare_migration(struct page *page, enum charge_type ctype; int ret = 0; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -2823,7 +2863,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { @@ -2850,13 +2890,13 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); return ret; } /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; @@ -2865,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, return; /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); - /* at migration success, oldpage->mapping is NULL. 
*/ - if (oldpage->mapping) { + if (!migration_ok) { used = oldpage; unused = newpage; } else { @@ -4176,13 +4215,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) */ if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; - pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); if (!pn) return 1; mem->info.nodeinfo[node] = pn; - memset(pn, 0, sizeof(*pn)); - for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) @@ -4206,14 +4243,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void) /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) - mem = kmalloc(size, GFP_KERNEL); + mem = kzalloc(size, GFP_KERNEL); else - mem = vmalloc(size); + mem = vzalloc(size); if (!mem) return NULL; - memset(mem, 0, size); mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!mem->stat) goto out_free; @@ -4461,7 +4497,8 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + PAGE_SIZE); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4623,6 +4660,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (is_target_pte_for_mc(vma, addr, *pte, NULL)) @@ -4638,7 +4676,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; - /* We've already held the mmap_sem */ + down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4650,6 +4688,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } + up_read(&mm->mmap_sem); precharge = mc.precharge; mc.precharge = 0; @@ -4659,10 +4698,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) static int mem_cgroup_precharge_mc(struct mm_struct *mm) { - return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); + unsigned long precharge = mem_cgroup_count_precharge(mm); + + VM_BUG_ON(mc.moving_task); + mc.moving_task = current; + return mem_cgroup_do_precharge(precharge); } -static void mem_cgroup_clear_mc(void) +/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ +static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; @@ -4697,23 +4741,28 @@ static void mem_cgroup_clear_mc(void) PAGE_SIZE * mc.moved_swap); } /* we've already done mem_cgroup_get(mc.to) */ - mc.moved_swap = 0; } - if (mc.mm) { - up_read(&mc.mm->mmap_sem); - mmput(mc.mm); - } + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static void mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + + /* + * we must clear moving_task before waking up waiters at the end of + * task migration. 
+ */ + mc.moving_task = NULL; + __mem_cgroup_clear_mc(); spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; spin_unlock(&mc.lock); - mc.moving_task = NULL; - mc.mm = NULL; mem_cgroup_end_move(from); - memcg_oom_recover(from); - memcg_oom_recover(to); - wake_up_all(&mc.waitq); } static int mem_cgroup_can_attach(struct cgroup_subsys *ss, @@ -4735,38 +4784,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return 0; /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { - /* - * We do all the move charge works under one mmap_sem to - * avoid deadlock with down_write(&mmap_sem) - * -> try_charge() -> if (mc.moving_task) -> sleep. - */ - down_read(&mm->mmap_sem); - VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); - VM_BUG_ON(mc.moving_task); - VM_BUG_ON(mc.mm); - mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; mc.to = mem; - mc.precharge = 0; - mc.moved_charge = 0; - mc.moved_swap = 0; spin_unlock(&mc.lock); - mc.moving_task = current; - mc.mm = mm; + /* We set mc.moving_task later */ ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - /* We call up_read() and mmput() in clear_mc(). */ - } else - mmput(mm); + } + mmput(mm); } return ret; } @@ -4789,6 +4823,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; retry: + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); @@ -4854,7 +4889,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - /* We've already held the mmap_sem */ +retry: + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + /* + * Someone who are holding the mmap_sem might be waiting in + * waitq. So we cancel all extra charges, wake up all waiters, + * and retry. Because we cancel precharges, we might not be able + * to move enough charges, but moving charge is a best-effort + * feature anyway, so it wouldn't be a big problem. + */ + __mem_cgroup_clear_mc(); + cond_resched(); + goto retry; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4873,6 +4920,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) */ break; } + up_read(&mm->mmap_sem); } static void mem_cgroup_move_task(struct cgroup_subsys *ss, @@ -4881,11 +4929,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct task_struct *p, bool threadgroup) { - if (!mc.mm) + struct mm_struct *mm; + + if (!mc.to) /* no need to move charge */ return; - mem_cgroup_move_charge(mc.mm); + mm = get_task_mm(p); + if (mm) { + mem_cgroup_move_charge(mm); + mmput(mm); + } mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 46ab2c0..548fbd7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. 
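
The kill_proc_ao() hunk above switches the reported granularity to compound_trans_order(), so si_addr_lsb tells the victim process how much of its address space the poison covers. A sketch of how a userspace SIGBUS handler could decode that field, assuming a libc recent enough to expose si_addr_lsb (the handler body is illustrative, not part of the patch):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* si_addr_lsb is log2 of the affected range: PAGE_SHIFT for a 4k page,
 * PAGE_SHIFT plus the compound order for a poisoned huge page. */
static void mce_handler(int sig, siginfo_t *si, void *ctx)
{
	unsigned long len = 1UL << si->si_addr_lsb;

	(void)sig; (void)ctx;
	/* fprintf is not async-signal-safe; acceptable for a demo only. */
	fprintf(stderr, "memory error at %p, %lu bytes affected\n",
		si->si_addr, len);
	_exit(1);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = mce_handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGBUS, &sa, NULL);

	/* Triggering a real BUS_MCEERR_AO needs hwpoison injection (for
	 * example madvise(..., MADV_HWPOISON) as root); this program only
	 * installs the handler and waits. */
	pause();
	return 0;
}
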
@@ -386,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; + if (!PageHuge(page) && unlikely(split_huge_page(page))) + return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ @@ -928,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -936,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -966,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_order(hpage); + nr_pages = 1 << compound_trans_order(hpage); atomic_long_add(nr_pages, &mce_bad_pages); /* @@ -1164,7 +1166,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); + nr_pages = 1 << compound_trans_order(page); if (!get_page_unless_zero(page)) { /* @@ -1290,9 +1292,10 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. */ list_add(&hpage->lru, &pagelist); - ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, + true); if (ret) { - putback_lru_pages(&pagelist); + putback_lru_pages(&pagelist); pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) @@ -1301,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags) } done: if (!PageHWPoison(hpage)) - atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); /* keep elevated page count for bad page */ @@ -1413,7 +1416,8 @@ int soft_offline_page(struct page *page, int flags) LIST_HEAD(pagelist); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + 0, true); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); diff --git a/mm/memory.c b/mm/memory.c index 02e48aa..31250fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address) { pgtable_t new = pte_alloc_one(mm, address); + int wait_split_huge_page; if (!new) return -ENOMEM; @@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ spin_lock(&mm->page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + wait_split_huge_page = 0; + if (likely(pmd_none(*pmd))) { /* Has another populated it ? 
*/ mm->nr_ptes++; pmd_populate(mm, pmd, new); new = NULL; - } + } else if (unlikely(pmd_trans_splitting(*pmd))) + wait_split_huge_page = 1; spin_unlock(&mm->page_table_lock); if (new) pte_free(mm, new); + if (wait_split_huge_page) + wait_split_huge_page(vma->anon_vma, pmd); return 0; } @@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; - } + } else + VM_BUG_ON(pmd_trans_splitting(*pmd)); spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); @@ -719,9 +726,9 @@ out_set_pte: return 0; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; @@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*src_pmd)) { + int err; + VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); + err = copy_huge_pmd(dst_mm, src_mm, + dst_pmd, src_pmd, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, @@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next-addr != HPAGE_PMD_SIZE) { + VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); + split_huge_page_pmd(vma->vm_mm, pmd); + } else if (zap_huge_pmd(tlb, vma, pmd)) { + (*zap_work)--; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) { (*zap_work)--; continue; @@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pud = pud_offset(pgd, address); if (pud_none(*pud)) goto no_page_table; - if (pud_huge(*pud)) { + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; @@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(mm, pmd); + goto split_fallthrough; + } + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(mm, address, + pmd, flags); + spin_unlock(&mm->page_table_lock); + goto out; + } + } else + spin_unlock(&mm->page_table_lock); + /* fall through */ + } +split_fallthrough: if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -1310,6 +1359,28 @@ struct page *follow_page(struct 
vm_area_struct *vma, unsigned long address, */ mark_page_accessed(page); } + if (flags & FOLL_MLOCK) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here and migration is + * blocked by the pte's page reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } unlock: pte_unmap_unlock(ptep, ptl); out: @@ -1341,7 +1412,8 @@ no_page_table: int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking) { int i; unsigned long vm_flags; @@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pmd = pmd_offset(pud, pg); if (pmd_none(*pmd)) return i ? : -EFAULT; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, pg); if (pte_none(*pte)) { pte_unmap(pte); @@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; + unsigned int fault_flags = 0; + + if (foll_flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; ret = handle_mm_fault(mm, vma, start, - (foll_flags & FOLL_WRITE) ? - FAULT_FLAG_WRITE : 0); + fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) @@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, else tsk->min_flt++; + if (ret & VM_FAULT_RETRY) { + *nonblocking = 0; + return i; + } + /* * The VM_FAULT_WRITE bit tells us that * do_wp_page has broken COW when necessary, @@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); @@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr) struct page *page; if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) return NULL; flush_cache_page(vma, addr, page_to_pfn(page)); return page; @@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, pud_t * pud = pud_alloc(mm, pgd, addr); if (pud) { pmd_t * pmd = pmd_alloc(mm, pud, addr); - if (pmd) + if (pmd) { + VM_BUG_ON(pmd_trans_huge(*pmd)); return pte_alloc_map_lock(mm, pmd, addr, ptl); + } } return NULL; } @@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; + VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); if (remap_pte_range(mm, pmd, addr, next, @@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, return same; } -/* - * Do pte_mkwrite, but only if the vma says VM_WRITE. 
We do this when - * servicing faults for write access. In the normal case, do always want - * pte_mkwrite. But get_user_pages can cause write faults for mappings - * that do not have writing enabled, when used by access_process_vm. - */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) -{ - if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); - return pte; -} - static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { /* @@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int reuse = 0, ret = 0; + int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; @@ -2149,14 +2224,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(old_page); } - reuse = reuse_swap_page(old_page); - if (reuse) + if (reuse_swap_page(old_page)) { /* * The page is all ours. Move it to our anon_vma so * the rmap code will not search our parent or siblings. * Protected against the rmap code by the page lock. */ page_move_anon_rmap(old_page, vma, address); + unlock_page(old_page); + goto reuse; + } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { @@ -2220,18 +2297,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } dirty_page = old_page; get_page(dirty_page); - reuse = 1; - } - if (reuse) { reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, page_table); + pte_unmap_unlock(page_table, ptl); ret |= VM_FAULT_WRITE; - goto unlock; + + if (!dirty_page) + return ret; + + /* + * Yes, Virginia, this is actually required to prevent a race + * with clear_page_dirty_for_io() from clearing the page dirty + * bit after it clear all dirty ptes, but before a racing + * do_wp_page installs a dirty pte. + * + * do_no_page is protected similarly. + */ + if (!page_mkwrite) { + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + } + put_page(dirty_page); + if (page_mkwrite) { + struct address_space *mapping = dirty_page->mapping; + + set_page_dirty(dirty_page); + unlock_page(dirty_page); + page_cache_release(dirty_page); + if (mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); + + return ret; } /* @@ -2337,39 +2448,6 @@ gotten: page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); - if (dirty_page) { - /* - * Yes, Virginia, this is actually required to prevent a race - * with clear_page_dirty_for_io() from clearing the page dirty - * bit after it clear all dirty ptes, but before a racing - * do_wp_page installs a dirty pte. - * - * do_no_page is protected similarly. 
- */ - if (!page_mkwrite) { - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); - } - put_page(dirty_page); - if (page_mkwrite) { - struct address_space *mapping = dirty_page->mapping; - - set_page_dirty(dirty_page); - unlock_page(dirty_page); - page_cache_release(dirty_page); - if (mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - } - - /* file_update_time outside page_lock */ - if (vma->vm_file) - file_update_time(vma->vm_file); - } return ret; oom_free_new: page_cache_release(new_page); @@ -3147,9 +3225,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, unsigned int flags) +int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -3228,9 +3306,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, pmd, address); - if (!pte) + if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + if (!vma->vm_ops) + return do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + } else { + pmd_t orig_pmd = *pmd; + barrier(); + if (pmd_trans_huge(orig_pmd)) { + if (flags & FAULT_FLAG_WRITE && + !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) + return do_huge_pmd_wp_page(mm, vma, address, + pmd, orig_pmd); + return 0; + } + } + + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); return handle_pte_fault(mm, vma, address, pte, pmd, flags); } @@ -3296,7 +3405,12 @@ int make_pages_present(unsigned long addr, unsigned long end) vma = find_vma(current->mm, addr); if (!vma) return -ENOMEM; - write = (vma->vm_flags & VM_WRITE) != 0; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. 
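[Editorial sketch] The comment above, added to make_pages_present(), explains when a write fault should be used: only for mappings that are writable and private, since shared mappings do not COW and should not be dirtied needlessly. A minimal userspace sketch of that flag test follows; the DEMO_VM_* values are illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>

#define DEMO_VM_WRITE  0x2u
#define DEMO_VM_SHARED 0x8u

/* Write-fault only mappings that are writable *and* private. */
static int demo_want_write_fault(unsigned int vm_flags)
{
        return (vm_flags & (DEMO_VM_WRITE | DEMO_VM_SHARED)) == DEMO_VM_WRITE;
}

int main(void)
{
        printf("private writable: %d\n", demo_want_write_fault(DEMO_VM_WRITE));
        printf("shared writable:  %d\n",
               demo_want_write_fault(DEMO_VM_WRITE | DEMO_VM_SHARED));
        printf("private readonly: %d\n", demo_want_write_fault(0));
        return 0;
}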
+ */ + write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; @@ -3368,6 +3482,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, goto out; pmd = pmd_offset(pud, address); + VM_BUG_ON(pmd_trans_huge(*pmd)); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; @@ -3608,3 +3723,74 @@ void might_fault(void) } EXPORT_SYMBOL(might_fault); #endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +static void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; + i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} +void clear_huge_page(struct page *page, + unsigned long addr, unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } +} + +static void copy_user_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, + struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page; ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + copy_user_gigantic_page(dst, src, addr, vma, + pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2c6523a..e92f047 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, int type) +static void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { - atomic_set(&page->_mapcount, type); + page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); atomic_inc(&page->_count); @@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) * so use __ref to tell modpost not to generate a warning */ void __ref put_page_bootmem(struct page *page) { - int type; + unsigned long type; - type = atomic_read(&page->_mapcount); - BUG_ON(type >= -1); + type = (unsigned long) page->lru.next; + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (atomic_dec_return(&page->_count) == 1) { ClearPagePrivate(page); set_page_private(page, 0); - reset_page_mapcount(page); + INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); } @@ -733,7 +735,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long 
end_pfn) goto out; } /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + true, true); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 11ff260..368fc9d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(vma->vm_mm, pmd); if (pmd_none_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, @@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return PTR_ERR(vma); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, 0); + err = migrate_pages(&pagelist, new_node_page, dest, + false, true); if (err) putback_lru_pages(&pagelist); } @@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0); + (unsigned long)vma, + false, true); if (nr_failed) putback_lru_pages(&pagelist); } @@ -1308,16 +1311,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, /* Find the mm_struct */ rcu_read_lock(); - read_lock(&tasklist_lock); task = pid ? find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); rcu_read_unlock(); err = -ESRCH; goto out; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); rcu_read_unlock(); err = -EINVAL; @@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_page_vma - Allocate a page for a VMA. + * alloc_pages_vma - Allocate a page for a VMA. * * @gfp: * %GFP_USER user allocation. @@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * + * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * Should be called with the mm_sem of the vma hold. 
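[Editorial sketch] For the alloc_page_interleave()/alloc_pages_vma() path touched in this hunk, interleave policy roughly amounts to picking a node from the allowed set by the page's offset modulo the number of allowed nodes. This is a loose userspace sketch of that idea under that assumption, not the kernel's exact node-selection code; all names are made up.

#include <stdio.h>

/* Pick a node from the allowed set by page offset, round robin. */
static int demo_interleave_node(unsigned long pgoff,
                                const int *nodes, int nr_nodes)
{
        return nodes[pgoff % nr_nodes];
}

int main(void)
{
        int nodes[] = { 0, 1, 3 };      /* allowed NUMA nodes (example) */
        unsigned long pgoff;

        for (pgoff = 0; pgoff < 6; pgoff++)
                printf("page %lu -> node %d\n", pgoff,
                       demo_interleave_node(pgoff, nodes, 3));
        return 0;
}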
*/ struct page * -alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - page = alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); return page; } @@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, 0, + struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); put_mems_allowed(); @@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * fast path: default or task policy */ - page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, order, zl, + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } diff --git a/mm/migrate.c b/mm/migrate.c index 6ae8a66..46fe8cc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto out; pmd = pmd_offset(pud, addr); + if (pmd_trans_huge(*pmd)) + goto out; if (!pmd_present(*pmd)) goto out; @@ -246,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -318,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -614,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining) + struct page *page, int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); int remap_swapcache = 1; - int rcu_locked = 0; int charge = 0; struct mem_cgroup *mem = NULL; struct anon_vma *anon_vma = NULL; @@ -632,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* page was freed from under us. So we are done. */ goto move_newpage; } + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) + goto move_newpage; /* prepare cgroup just returns 0 or -ENOMEM */ rc = -EAGAIN; @@ -639,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!trylock_page(page)) { if (!force) goto move_newpage; + + /* + * It's not safe for direct compaction to call lock_page. + * For example, during page readahead pages are added locked + * to the LRU. Later, when the IO completes the pages are + * marked uptodate and unlocked. 
However, the queueing + * could be merging multiple pages for one bio (e.g. + * mpage_readpages). If an allocation happens for the + * second or third page, the process can end up locking + * the same page twice and deadlocking. Rather than + * trying to be clever about what pages can be locked, + * avoid the use of lock_page for direct compaction + * altogether. + */ + if (current->flags & PF_MEMALLOC) + goto move_newpage; + lock_page(page); } @@ -665,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, BUG_ON(charge); if (PageWriteback(page)) { - if (!force) + if (!force || !sync) goto uncharge; wait_on_page_writeback(page); } /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. - * This rcu_read_lock() delays freeing anon_vma pointer until the end + * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. */ if (PageAnon(page)) { - rcu_read_lock(); - rcu_locked = 1; - - /* Determine how to safely use anon_vma */ - if (!page_mapped(page)) { - if (!PageSwapCache(page)) - goto rcu_unlock; - + /* + * Only page_lock_anon_vma() understands the subtleties of + * getting a hold on an anon_vma from outside one of its mms. + */ + anon_vma = page_lock_anon_vma(page); + if (anon_vma) { + /* + * Take a reference count on the anon_vma if the + * page is mapped so that it is guaranteed to + * exist when the page is remapped later + */ + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); + } else if (PageSwapCache(page)) { /* * We cannot be sure that the anon_vma of an unmapped * swapcache page is safe to use because we don't @@ -700,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, */ remap_swapcache = 0; } else { - /* - * Take a reference count on the anon_vma if the - * page is mapped so that it is guaranteed to - * exist when the page is remapped later - */ - anon_vma = page_anon_vma(page); - get_anon_vma(anon_vma); + goto uncharge; } } @@ -723,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * free the metadata, so the page can be freed. */ if (!page->mapping) { - if (!PageAnon(page) && page_has_private(page)) { - /* - * Go direct to try_to_free_buffers() here because - * a) that's what try_to_release_page() would do anyway - * b) we may be under rcu_read_lock() here, so we can't - * use GFP_KERNEL which is what try_to_release_page() - * needs to be effective. 
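[Editorial sketch] The hunk above replaces the rcu_read_lock()-based protection of the anon_vma with an explicit reference, taken via page_lock_anon_vma()/get_anon_vma() while the lock is held and dropped once migration finishes. A userspace sketch of that take-a-reference-under-the-lock pattern, with a plain mutex and counter standing in for the real structures:

#include <pthread.h>
#include <stdio.h>

struct demo_anon_vma {
        pthread_mutex_t lock;
        int refcount;
};

/* Pin the object while its lock is held, then drop the lock. */
static struct demo_anon_vma *demo_get(struct demo_anon_vma *av)
{
        pthread_mutex_lock(&av->lock);
        av->refcount++;
        pthread_mutex_unlock(&av->lock);
        return av;
}

static void demo_put(struct demo_anon_vma *av)
{
        pthread_mutex_lock(&av->lock);
        av->refcount--;         /* real code would free at zero */
        pthread_mutex_unlock(&av->lock);
}

int main(void)
{
        struct demo_anon_vma av = { PTHREAD_MUTEX_INITIALIZER, 1 };
        struct demo_anon_vma *held = demo_get(&av);

        /* ... the slow operation (migration) runs here, unlocked ... */
        demo_put(held);
        printf("refcount: %d\n", av.refcount);
        return 0;
}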
- */ + VM_BUG_ON(PageAnon(page)); + if (page_has_private(page)) { try_to_free_buffers(page); - goto rcu_unlock; + goto uncharge; } goto skip_unmap; } @@ -746,17 +761,14 @@ skip_unmap: if (rc && remap_swapcache) remove_migration_ptes(page, page); -rcu_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) drop_anon_vma(anon_vma); - if (rcu_locked) - rcu_read_unlock(); uncharge: if (!charge) - mem_cgroup_end_migration(mem, page, newpage); + mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); @@ -810,12 +822,11 @@ move_newpage: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, int offlining) + int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; struct page *new_hpage = get_new_page(hpage, private, &result); - int rcu_locked = 0; struct anon_vma *anon_vma = NULL; if (!new_hpage) @@ -824,18 +835,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force) + if (!force || !sync) goto out; lock_page(hpage); } if (PageAnon(hpage)) { - rcu_read_lock(); - rcu_locked = 1; - - if (page_mapped(hpage)) { - anon_vma = page_anon_vma(hpage); - atomic_inc(&anon_vma->external_refcount); + anon_vma = page_lock_anon_vma(hpage); + if (anon_vma) { + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); } } @@ -847,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (rc) remove_migration_ptes(hpage, hpage); - if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, - &anon_vma->lock)) { - int empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); - if (empty) - anon_vma_free(anon_vma); - } - - if (rcu_locked) - rcu_read_unlock(); + if (anon_vma) + drop_anon_vma(anon_vma); out: unlock_page(hpage); @@ -892,7 +893,8 @@ out: * Return: Number of pages not migrated or error code. 
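[Editorial sketch] The "number of pages not migrated or error code" return documented above comes from a bounded multi-pass loop: -EAGAIN results are retried on a later pass, -ENOMEM aborts the whole batch, and anything else counts as a failure. A rough userspace sketch of that structure; demo_migrate_one() is a stand-in, and a real implementation also removes successfully migrated entries from the list.

#include <errno.h>
#include <stdio.h>

/* Stand-in: even ids only "migrate" from the second pass onward. */
static int demo_migrate_one(int id, int pass)
{
        if (id % 2 == 0 && pass == 0)
                return -EAGAIN;
        return 0;
}

static int demo_migrate_list(const int *ids, int n)
{
        int nr_failed = 0, retry = 1;
        int pass, i;

        for (pass = 0; pass < 10 && retry; pass++) {
                retry = 0;
                for (i = 0; i < n; i++) {
                        int rc = demo_migrate_one(ids[i], pass);

                        if (rc == -ENOMEM)
                                return rc;      /* abort the whole batch */
                        else if (rc == -EAGAIN)
                                retry++;        /* try again next pass */
                        else if (rc)
                                nr_failed++;    /* permanent failure */
                }
        }
        return nr_failed + retry;       /* pages left unmigrated */
}

int main(void)
{
        int ids[] = { 1, 2, 3, 4 };

        printf("not migrated: %d\n", demo_migrate_list(ids, 4));
        return 0;
}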
*/ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, bool offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -912,7 +914,8 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2, offlining); + page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -941,7 +944,8 @@ out: } int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, bool offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -957,7 +961,8 @@ int migrate_huge_pages(struct list_head *from, cond_resched(); rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, offlining); + private, page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -1042,7 +1047,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET); + page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1090,7 +1095,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0); + (unsigned long)pm, 0, true); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/mincore.c b/mm/mincore.c index 9ac42dc..a4e6b9d 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { + vec += (next - addr) >> PAGE_SHIFT; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else @@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add * vma->vm_mm->mmap_sem must be held for at least read. */ static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int *nonblocking) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; - struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; - int ret = 0; int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); @@ -170,73 +169,26 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH | FOLL_GET; - if (vma->vm_flags & VM_WRITE) + gup_flags = FOLL_TOUCH; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; + if (vma->vm_flags & VM_LOCKED) + gup_flags |= FOLL_MLOCK; + /* We don't try to access the guard page of a stack vma */ if (stack_guard_page(vma, start)) { addr += PAGE_SIZE; nr_pages--; } - while (nr_pages > 0) { - int i; - - cond_resched(); - - /* - * get_user_pages makes pages present if we are - * setting mlock. and this extra reference count will - * disable migration of this page. However, page may - * still be truncated out from under us. 
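[Editorial sketch] The open-coded mlock loop being removed in this hunk, and the FOLL_MLOCK handling added to follow_page() earlier in this diff, both rely on the same idea: an optimistic check of page->mapping, a trylock, and a re-check under the lock to cope with truncation racing in. A userspace sketch of that check/trylock/re-check pattern with pthreads; the struct and field names are illustrative only.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_page {
        pthread_mutex_t lock;
        void *mapping;          /* NULL once "truncated" */
        bool mlocked;
};

static void demo_try_mlock(struct demo_page *page)
{
        /* Cheap check without the lock: skip obviously dead pages. */
        if (page->mapping == NULL)
                return;
        /* Do not block; if the lock is busy, handle it later. */
        if (pthread_mutex_trylock(&page->lock) != 0)
                return;
        /* Re-check under the lock: truncation may have raced with us. */
        if (page->mapping != NULL)
                page->mlocked = true;
        pthread_mutex_unlock(&page->lock);
}

int main(void)
{
        struct demo_page page = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .mapping = &page,       /* pretend it is file backed */
        };

        demo_try_mlock(&page);
        printf("mlocked: %d\n", page.mlocked);
        return 0;
}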
- */ - ret = __get_user_pages(current, mm, addr, - min_t(int, nr_pages, ARRAY_SIZE(pages)), - gup_flags, pages, NULL); - /* - * This can happen for, e.g., VM_NONLINEAR regions before - * a page has been allocated and mapped at a given offset, - * or for addresses that map beyond end of a file. - * We'll mlock the pages if/when they get faulted in. - */ - if (ret < 0) - break; - - lru_add_drain(); /* push cached pages to LRU */ - - for (i = 0; i < ret; i++) { - struct page *page = pages[i]; - - if (page->mapping) { - /* - * That preliminary check is mainly to avoid - * the pointless overhead of lock_page on the - * ZERO_PAGE: which might bounce very badly if - * there is contention. However, we're still - * dirtying its cacheline with get/put_page: - * we'll add another __get_user_pages flag to - * avoid it if that case turns out to matter. - */ - lock_page(page); - /* - * Because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - mlock_vma_page(page); - unlock_page(page); - } - put_page(page); /* ref from get_user_pages() */ - } - - addr += ret * PAGE_SIZE; - nr_pages -= ret; - ret = 0; - } - - return ret; /* 0 or negative error code */ + return __get_user_pages(current, mm, addr, nr_pages, gup_flags, + NULL, NULL, nonblocking); } /* @@ -280,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - __mlock_vma_pages_range(vma, start, end); + __mlock_vma_pages_range(vma, start, end, NULL); /* Hide errors from mmap() and other callers */ return 0; @@ -372,18 +324,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int ret = 0; int lock = newflags & VM_LOCKED; - if (newflags == vma->vm_flags || - (vma->vm_flags & (VM_IO | VM_PFNMAP))) + if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) goto out; /* don't set VM_LOCKED, don't count */ - if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current)) { - if (lock) - make_pages_present(start, end); - goto out; /* don't set VM_LOCKED, don't count */ - } - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); @@ -419,14 +363,10 @@ success: * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 
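[Editorial sketch] With mlock_fixup() reduced to setting VM_LOCKED, the actual fault-in is deferred to a separate walk (do_mlock_pages(), added just below) that intersects the requested range with each VMA in turn and skips unsuitable mappings. A simplified userspace sketch of that intersection walk, with struct demo_vma standing in for real VMAs and printf standing in for the fault-in call:

#include <stdio.h>

struct demo_vma {
        unsigned long start, end;
        int special;            /* e.g. a VM_IO | VM_PFNMAP mapping */
};

static void demo_fault_in_range(unsigned long start, unsigned long end,
                                const struct demo_vma *vmas, int nr)
{
        unsigned long nstart, nend;
        int i = 0;

        for (nstart = start; nstart < end; nstart = nend) {
                /* Advance to the first mapping that may cover nstart. */
                while (i < nr && vmas[i].end <= nstart)
                        i++;
                if (i == nr || vmas[i].start >= end)
                        break;
                /* Intersect the request with this mapping. */
                nend = end < vmas[i].end ? end : vmas[i].end;
                if (vmas[i].special)
                        continue;       /* skip it, resume after it */
                if (nstart < vmas[i].start)
                        nstart = vmas[i].start;
                printf("fault in [%#lx, %#lx)\n", nstart, nend);
        }
}

int main(void)
{
        struct demo_vma vmas[] = {
                { 0x1000, 0x3000, 0 },
                { 0x4000, 0x6000, 1 },
                { 0x6000, 0x9000, 0 },
        };

        demo_fault_in_range(0x2000, 0x8000, vmas, 3);
        return 0;
}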
*/ - if (lock) { + if (lock) vma->vm_flags = newflags; - ret = __mlock_vma_pages_range(vma, start, end); - if (ret < 0) - ret = __mlock_posix_error_return(ret); - } else { + else munlock_vma_pages_range(vma, start, end); - } out: *prev = vma; @@ -439,7 +379,8 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * prev; int error; - len = PAGE_ALIGN(len); + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; @@ -482,6 +423,62 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } +static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int locked = 0; + int ret = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. + */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages. __mlock_vma_pages_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. + */ + ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + ret = __mlock_posix_error_return(ret); + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; + } + if (locked) + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; @@ -507,6 +504,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); up_write(&current->mm->mmap_sem); + if (!error) + error = do_mlock_pages(start, len, 0); return error; } @@ -571,6 +570,10 @@ SYSCALL_DEFINE1(mlockall, int, flags) capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); up_write(&current->mm->mmap_sem); + if (!ret && (flags & MCL_CURRENT)) { + /* Ignore errors */ + do_mlock_pages(0, TASK_SIZE, 1); + } out: return ret; } @@ -29,6 +29,7 @@ #include <linux/mmu_notifier.h> #include <linux/perf_event.h> #include <linux/audit.h> +#include <linux/khugepaged.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) down_write(&mm->mmap_sem); #ifdef CONFIG_COMPAT_BRK - min_brk = mm->end_code; + /* + * CONFIG_COMPAT_BRK can still be overridden by setting + * randomize_va_space to 2, which will still cause mm->start_brk + * to be arbitrarily shifted + */ + if (mm->start_brk > PAGE_ALIGN(mm->end_data)) + min_brk = mm->start_brk; + else + min_brk = mm->end_data; #else min_brk = mm->start_brk; #endif @@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); } } + vma_adjust_trans_huge(vma, start, end, adjust_next); + /* * When changing only vma->vm_end, we don't really need anon_vma * lock.
This is a fairly rare case by itself, but the anon_vma @@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(prev); return prev; } @@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(area); return area; } @@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma, } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 438951d..8d032de 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->test_young) { + young = mn->ops->test_young(mn, mm, address); + if (young) + break; + } + } + rcu_read_unlock(); + + return young; +} + void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb8..f5b7d17 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - -#ifdef CONFIG_SMP -/* Called when a more accurate view of NR_FREE_PAGES is needed */ -unsigned long zone_nr_free_pages(struct zone *zone) -{ - unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); - - /* - * While kswapd is awake, it is considered the zone is under some - * memory pressure. Under pressure, there is a risk that - * per-cpu-counter-drift will allow the min watermark to be breached - * potentially causing a live-lock. 
While kswapd is awake and - * free pages are low, get a better estimate for free pages - */ - if (nr_free_pages < zone->percpu_drift_mark && - !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) - return zone_page_state_snapshot(zone, NR_FREE_PAGES); - - return nr_free_pages; -} -#endif /* CONFIG_SMP */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c51338..5a688a2 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, pte_unmap_unlock(pte - 1, ptl); } -static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, +static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (change_huge_pmd(vma, pmd, addr, newprot)) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) continue; - change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); + change_pte_range(vma->vm_mm, pmd, addr, next, newprot, + dirty_accountable); } while (pmd++, addr = next, addr != end); } -static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, +static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); + change_pmd_range(vma, pud, addr, next, newprot, + dirty_accountable); } while (pud++, addr = next, addr != end); } @@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); + change_pud_range(vma, pgd, addr, next, newprot, + dirty_accountable); } while (pgd++, addr = next, addr != end); flush_tlb_range(vma, start, end); } diff --git a/mm/mremap.c b/mm/mremap.c index 563fbdd..9925b63 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return NULL; pmd = pmd_offset(pud, addr); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) return NULL; return pmd; } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) if (!pmd) return NULL; - if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) + VM_BUG_ON(pmd_trans_huge(*pmd)); + if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) return NULL; return pmd; @@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; - new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); + new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; next = (new_addr + PMD_SIZE) & PMD_MASK; @@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp) int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 
unsigned long start, int nr_pages, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *retry) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b5d8a1f..2cb01f6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -410,9 +410,12 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { unsigned long background; unsigned long dirty; - unsigned long available_memory = determine_dirtyable_memory(); + unsigned long uninitialized_var(available_memory); struct task_struct *tsk; + if (!vm_dirty_bytes || !dirty_background_bytes) + available_memory = determine_dirtyable_memory(); + if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); else @@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page); int __set_page_dirty_no_writeback(struct page *page) { if (!PageDirty(page)) - SetPageDirty(page); + return !TestSetPageDirty(page); return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 826ba69..90c1439 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order) } } +/* update __split_huge_page_refcount if you change this function */ static int destroy_compound_page(struct page *page, unsigned long order) { int i; @@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page) * * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ -static inline struct page * -__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) -{ - unsigned long buddy_idx = page_idx ^ (1 << order); - - return page + (buddy_idx - page_idx); -} - static inline unsigned long -__find_combined_index(unsigned long page_idx, unsigned int order) +__find_buddy_index(unsigned long page_idx, unsigned int order) { - return (page_idx & ~(1 << order)); + return page_idx ^ (1 << order); } /* @@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we use PG_buddy. - * Setting, clearing, and testing PG_buddy is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount -2. + * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_buddy. Page's + * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. 
That is, if we allocate a small block, and both were @@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page, { unsigned long page_idx; unsigned long combined_idx; + unsigned long uninitialized_var(buddy_idx); struct page *buddy; if (unlikely(PageCompound(page))) @@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(bad_range(zone, page)); while (order < MAX_ORDER-1) { - buddy = __page_find_buddy(page, page_idx, order); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; @@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page, list_del(&buddy->lru); zone->free_area[order].nr_free--; rmv_page_order(buddy); - combined_idx = __find_combined_index(page_idx, order); + combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; @@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page, */ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { struct page *higher_page, *higher_buddy; - combined_idx = __find_combined_index(page_idx, order); - higher_page = page + combined_idx - page_idx; - higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); + combined_idx = buddy_idx & page_idx; + higher_page = page + (combined_idx - page_idx); + buddy_idx = __find_buddy_index(combined_idx, order + 1); + higher_buddy = page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); @@ -651,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order) trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); - for (i = 0; i < (1 << order); i++) { - struct page *pg = page + i; - - if (PageAnon(pg)) - pg->mapping = NULL; - bad += free_pages_check(pg); - } + if (PageAnon(page)) + page->mapping = NULL; + for (i = 0; i < (1 << order); i++) + bad += free_pages_check(page + i); if (bad) return false; @@ -1460,24 +1453,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return 1 if free pages are above 'mark'. This takes into account the order + * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. 
*/ -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags, long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; + free_pages -= (1 << order) + 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) - return 0; + return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ free_pages -= z->free_area[o].nr_free << o; @@ -1486,9 +1479,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, min >>= 1; if (free_pages <= min) - return 0; + return false; } - return 1; + return true; +} + +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + zone_page_state(z, NR_FREE_PAGES)); +} + +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + + if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) + free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); } #ifdef CONFIG_NUMA @@ -1793,15 +1805,18 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { struct page *page; if (!order || compaction_deferred(preferred_zone)) return NULL; + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask); + nodemask, sync_migration); + current->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ @@ -1837,7 +1852,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { return NULL; } @@ -1852,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, { struct page *page = NULL; struct reclaim_state reclaim_state; - struct task_struct *p = current; bool drained = false; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - p->flags |= PF_MEMALLOC; + current->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + current->reclaim_state = &reclaim_state; *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); - p->reclaim_state = NULL; + current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + current->flags &= ~PF_MEMALLOC; cond_resched(); @@ -1920,19 +1935,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, static inline void 
wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx) + enum zone_type high_zoneidx, + enum zone_type classzone_idx) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wakeup_kswapd(zone, order, classzone_idx); } static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { - struct task_struct *p = current; int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; const gfp_t wait = gfp_mask & __GFP_WAIT; @@ -1948,18 +1963,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (!wait) { - alloc_flags |= ALLOC_HARDER; + /* + * Not worth trying to allocate harder for + * __GFP_NOMEMALLOC even if it can't schedule. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + alloc_flags |= ALLOC_HARDER; /* * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(p)) && !in_interrupt()) + } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { if (!in_interrupt() && - ((p->flags & PF_MEMALLOC) || + ((current->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } @@ -1978,7 +1998,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - struct task_struct *p = current; + bool sync_migration = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2003,7 +2023,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2034,21 +2056,26 @@ rebalance: goto nopage; /* Avoid recursion of direct reclaim */ - if (p->flags & PF_MEMALLOC) + if (current->flags & PF_MEMALLOC) goto nopage; /* Avoid allocations with no watermarks from looping endlessly */ if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* Try direct compaction */ + /* + * Try direct compaction. The first pass is asynchronous. Subsequent + * attempts after direct reclaim are synchronous + */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; + sync_migration = true; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, @@ -2102,13 +2129,27 @@ rebalance: /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; + } else { + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress, + sync_migration); + if (page) + goto got_pg; } nopage: if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { printk(KERN_WARNING "%s: page allocation failure." 
" order:%d, mode:0x%x\n", - p->comm, order, gfp_mask); + current->comm, order, gfp_mask); dump_stack(); show_mem(); } @@ -2442,7 +2483,7 @@ void show_free_areas(void) " all_unreclaimable? %s" "\n", zone->name, - K(zone_nr_free_pages(zone)), + K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), @@ -2585,9 +2626,16 @@ static int __parse_numa_zonelist_order(char *s) static __init int setup_numa_zonelist_order(char *s) { - if (s) - return __parse_numa_zonelist_order(s); - return 0; + int ret; + + if (!s) + return 0; + + ret = __parse_numa_zonelist_order(s); + if (ret == 0) + strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); + + return ret; } early_param("numa_zonelist_order", setup_numa_zonelist_order); @@ -5517,7 +5565,6 @@ static struct trace_print_flags pageflag_names[] = { {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_buddy, "buddy" }, {1UL << PG_swapbacked, "swapbacked" }, {1UL << PG_unevictable, "unevictable" }, #ifdef CONFIG_MMU @@ -5565,7 +5612,7 @@ void dump_page(struct page *page) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", - page, page_count(page), page_mapcount(page), + page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 38cc58b..7cfa6ae 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(walk->mm, pmd); if (pmd_none_or_clear_bad(pmd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 7d9c1d0..ea53496 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, - pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); + pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c new file mode 100644 index 0000000..d030548 --- /dev/null +++ b/mm/pgtable-generic.c @@ -0,0 +1,123 @@ +/* + * mm/pgtable-generic.c + * + * Generic pgtable methods declared in asm-generic/pgtable.h + * + * Copyright (C) 2010 Linus Torvalds + */ + +#include <asm/tlb.h> +#include <asm-generic/pgtable.h> + +#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +/* + * Only sets the access flags (dirty, accessed, and + * writable). Furthermore, we know it always gets set to a "more + * permissive" setting, which allows most architectures to optimize + * this. We return whether the PTE actually changed, which in turn + * instructs the caller to do things like update__mmu_cache. 
This + * used to be done in the caller, but sparc needs minor faults to + * force that call on sun4c so we changed this macro slightly + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + if (changed) { + set_pte_at(vma->vm_mm, address, ptep, entry); + flush_tlb_page(vma, address); + } + return changed; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int changed = !pmd_same(*pmdp, entry); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (changed) { + set_pmd_at(vma->vm_mm, address, pmdp, entry); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + return changed; +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ + BUG(); + return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + pte_t pte; + pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); + flush_tlb_page(vma, address); + return pte; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pmd_t pmd = pmd_mksplitting(*pmdp); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + /* tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} +#endif @@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, list_add(&avc->same_vma, &vma->anon_vma_chain); anon_vma_lock(anon_vma); + /* + * It's critical to add new vmas to the tail of the anon_vma, + * see comment in huge_memory.c:__split_huge_page(). + */ list_add_tail(&avc->same_anon_vma, &anon_vma->head); anon_vma_unlock(anon_vma); } @@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) * Returns virtual address or -EFAULT if page's index/offset is not * within the range mapped the @vma. 
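[Editorial sketch] vma_address(), whose documentation appears above, maps a page's file index back to a user virtual address using the VMA's start address and pgoff, returning -EFAULT when the result falls outside the mapping. A small sketch of that computation under the usual 4 KiB page-size assumption; the names and constants are illustrative, not the kernel's.

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_EFAULT     ((unsigned long)-14)

struct demo_vma {
        unsigned long vm_start, vm_end;
        unsigned long vm_pgoff;         /* file offset of vm_start, in pages */
};

static unsigned long demo_vma_address(unsigned long pgoff,
                                      const struct demo_vma *vma)
{
        unsigned long address = vma->vm_start +
                ((pgoff - vma->vm_pgoff) << DEMO_PAGE_SHIFT);

        if (address < vma->vm_start || address >= vma->vm_end)
                return DEMO_EFAULT;     /* index not mapped by this vma */
        return address;
}

int main(void)
{
        struct demo_vma vma = { 0x400000, 0x404000, 16 };

        printf("%#lx\n", demo_vma_address(17, &vma));   /* 0x401000 */
        printf("%#lx\n", demo_vma_address(64, &vma));   /* "-EFAULT" */
        return 0;
}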
*/ -static inline unsigned long +inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return NULL; + if (pmd_trans_huge(*pmd)) + return NULL; pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ @@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - pte_t *pte; - spinlock_t *ptl; int referenced = 0; - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) - goto out; - /* * Don't want to elevate referenced for mlocked page that gets this far, * in order that it progresses to try_to_unmap and is moved to the * unevictable list. */ if (vma->vm_flags & VM_LOCKED) { - *mapcount = 1; /* break early from loop */ + *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; - goto out_unmap; - } - - if (ptep_clear_flush_young_notify(vma, address, pte)) { - /* - * Don't treat a reference through a sequentially read - * mapping as such. If the page has been used in - * another mapping, we will catch it; if this other - * mapping is already gone, the unmap path will have - * set PG_referenced or activated the page. - */ - if (likely(!VM_SequentialReadHint(vma))) - referenced++; + goto out; } /* Pretend the page is referenced if the task has the @@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, rwsem_is_locked(&mm->mmap_sem)) referenced++; -out_unmap: + if (unlikely(PageTransHuge(page))) { + pmd_t *pmd; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_FLAG); + if (pmd && !pmd_trans_splitting(*pmd) && + pmdp_clear_flush_young_notify(vma, address, pmd)) + referenced++; + spin_unlock(&mm->page_table_lock); + } else { + pte_t *pte; + spinlock_t *ptl; + + pte = page_check_address(page, mm, address, &ptl, 0); + if (!pte) + goto out; + + if (ptep_clear_flush_young_notify(vma, address, pte)) { + /* + * Don't treat a reference through a sequentially read + * mapping as such. If the page has been used in + * another mapping, we will catch it; if this other + * mapping is already gone, the unmap path will have + * set PG_referenced or activated the page. 
+ */ + if (likely(!VM_SequentialReadHint(vma))) + referenced++; + } + pte_unmap_unlock(pte, ptl); + } + (*mapcount)--; - pte_unmap_unlock(pte, ptl); if (referenced) *vm_flags |= vma->vm_flags; @@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { int first = atomic_inc_and_test(&page->_mapcount); - if (first) - __inc_zone_page_state(page, NR_ANON_PAGES); + if (first) { + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); + } if (unlikely(PageKsm(page))) return; @@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - __inc_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); @@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, 1); + mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); } } @@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page) return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_PAGES); + else + __dec_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, -1); + mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); } /* * It would be tidy to reset the PageAnon mapping here, @@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -static bool is_vma_temporary_stack(struct vm_area_struct *vma) +bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int ret; BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); if (unlikely(PageKsm(page))) ret = try_to_unmap_ksm(page, flags); @@ -3636,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, "%7ld ", l->count); if (l->addr) - len += sprint_symbol(buf + len, (unsigned long)l->addr); + len += sprintf(buf + len, "%pS", (void *)l->addr); else len += sprintf(buf + len, "<not-available>"); @@ -3946,12 +3946,9 @@ SLAB_ATTR(min_partial); static ssize_t ctor_show(struct kmem_cache *s, char *buf) { - if (s->ctor) { - int n = sprint_symbol(buf, (unsigned long)s->ctor); - - return n + sprintf(buf + n, "\n"); - } - return 0; + if (!s->ctor) + return 0; + return sprintf(buf, "%pS\n", s->ctor); } SLAB_ATTR_RO(ctor); diff --git a/mm/sparse.c b/mm/sparse.c index 95ac219..9325020 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) static void free_map_bootmem(struct page *page, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; - int magic; + unsigned long magic; for (i = 0; i < nr_pages; i++, page++) { - magic = 
atomic_read(&page->_mapcount); + magic = (unsigned long) page->lru.next; BUG_ON(magic == NODE_INFO); @@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page) del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } +} + +static void __put_single_page(struct page *page) +{ + __page_cache_release(page); free_hot_cold_page(page, 0); } -static void put_compound_page(struct page *page) +static void __put_compound_page(struct page *page) { - page = compound_head(page); - if (put_page_testzero(page)) { - compound_page_dtor *dtor; + compound_page_dtor *dtor; - dtor = get_compound_page_dtor(page); - (*dtor)(page); + __page_cache_release(page); + dtor = get_compound_page_dtor(page); + (*dtor)(page); +} + +static void put_compound_page(struct page *page) +{ + if (unlikely(PageTail(page))) { + /* __split_huge_page_refcount can run under us */ + struct page *page_head = page->first_page; + smp_rmb(); + /* + * If PageTail is still set after smp_rmb() we can be sure + * that the page->first_page we read wasn't a dangling pointer. + * See __split_huge_page_refcount() smp_wmb(). + */ + if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + unsigned long flags; + /* + * Verify that our page_head wasn't converted + * to a a regular page before we got a + * reference on it. + */ + if (unlikely(!PageHead(page_head))) { + /* PageHead is cleared after PageTail */ + smp_rmb(); + VM_BUG_ON(PageTail(page)); + goto out_put_head; + } + /* + * Only run compound_lock on a valid PageHead, + * after having it pinned with + * get_page_unless_zero() above. + */ + smp_mb(); + /* page_head wasn't a dangling pointer */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); + VM_BUG_ON(PageHead(page_head)); + out_put_head: + if (put_page_testzero(page_head)) + __put_single_page(page_head); + out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; + } + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by + * get_page_unless_zero now that + * split_huge_page_refcount is blocked on the + * compound_lock. + */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_dec(&page->_count); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON(PageTail(page)); + goto out_put_single; + } + } else if (put_page_testzero(page)) { + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); } } @@ -75,7 +155,7 @@ void put_page(struct page *page) if (unlikely(PageCompound(page))) put_compound_page(page); else if (put_page_testzero(page)) - __page_cache_release(page); + __put_single_page(page); } EXPORT_SYMBOL(put_page); @@ -98,15 +178,13 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); -/* - * pagevec_move_tail() must be called with IRQ disabled. - * Otherwise this may cause nasty races. 
- */ -static void pagevec_move_tail(struct pagevec *pvec) +static void pagevec_lru_move_fn(struct pagevec *pvec, + void (*move_fn)(struct page *page, void *arg), + void *arg) { int i; - int pgmoved = 0; struct zone *zone = NULL; + unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -114,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec) if (pagezone != zone) { if (zone) - spin_unlock(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); zone = pagezone; - spin_lock(&zone->lru_lock); - } - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - pgmoved++; + spin_lock_irqsave(&zone->lru_lock, flags); } + + (*move_fn)(page, arg); } if (zone) - spin_unlock(&zone->lru_lock); - __count_vm_events(PGROTATED, pgmoved); - release_pages(pvec->pages, pvec->nr, pvec->cold); + spin_unlock_irqrestore(&zone->lru_lock, flags); + release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); pagevec_reinit(pvec); } +static void pagevec_move_tail_fn(struct page *page, void *arg) +{ + int *pgmoved = arg; + struct zone *zone = page_zone(page); + + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int lru = page_lru_base_type(page); + list_move_tail(&page->lru, &zone->lru[lru].list); + (*pgmoved)++; + } +} + +/* + * pagevec_move_tail() must be called with IRQ disabled. + * Otherwise this may cause nasty races. + */ +static void pagevec_move_tail(struct pagevec *pvec) +{ + int pgmoved = 0; + + pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); + __count_vm_events(PGROTATED, pgmoved); +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. */ -void rotate_reclaimable_page(struct page *page) +void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && !PageUnevictable(page) && PageLRU(page)) { @@ -173,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page, } /* - * FIXME: speed this up? + * A page will go to active list either by activate_page or putback_lru_page. + * In the activate_page case, the page hasn't active bit set. The page might + * not in LRU list because it's isolated before it gets a chance to be moved to + * active list. The window is small because pagevec just stores several pages. + * For such case, we do nothing for such page. 
+ * In the putback_lru_page case, the page isn't in lru list but has active + * bit set */ -void activate_page(struct page *page) +static void __activate_page(struct page *page, void *arg) { struct zone *zone = page_zone(page); + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + bool putback = !PageLRU(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); + /* The page is isolated before it's moved to active list */ + if (!PageLRU(page) && !PageActive(page)) + return; + if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page)) + return; + + if (!putback) del_page_from_lru_list(zone, page, lru); + else + SetPageLRU(page); - SetPageActive(page); - lru += LRU_ACTIVE; - add_page_to_lru_list(zone, page, lru); - __count_vm_event(PGACTIVATE); + SetPageActive(page); + lru += LRU_ACTIVE; + add_page_to_lru_list(zone, page, lru); - update_page_reclaim_stat(zone, page, file, 1); + if (putback) + return; + __count_vm_event(PGACTIVATE); + update_page_reclaim_stat(zone, page, file, 1); +} + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); + +static void activate_page_drain(int cpu) +{ + struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); + + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); +} + +void activate_page(struct page *page) +{ + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); + put_cpu_var(activate_page_pvecs); + } +} + +/* Caller should hold zone->lru_lock */ +int putback_active_lru_page(struct zone *zone, struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + + if (!pagevec_add(pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + pagevec_lru_move_fn(pvec, __activate_page, NULL); + spin_lock_irq(&zone->lru_lock); } + put_cpu_var(activate_page_pvecs); + return 1; +} + +#else +static inline void activate_page_drain(int cpu) +{ +} + +void activate_page(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) + __activate_page(page, NULL); spin_unlock_irq(&zone->lru_lock); } +#endif /* * Mark a page as having seen activity. 
@@ -292,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu) pagevec_move_tail(pvec); local_irq_restore(flags); } + activate_page_drain(cpu); } void lru_add_drain(void) @@ -399,44 +565,70 @@ void __pagevec_release(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_release); +/* used by __split_huge_page_refcount() */ +void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail) +{ + int active; + enum lru_list lru; + const int file = 0; + struct list_head *head; + + VM_BUG_ON(!PageHead(page)); + VM_BUG_ON(PageCompound(page_tail)); + VM_BUG_ON(PageLRU(page_tail)); + VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + + SetPageLRU(page_tail); + + if (page_evictable(page_tail, NULL)) { + if (PageActive(page)) { + SetPageActive(page_tail); + active = 1; + lru = LRU_ACTIVE_ANON; + } else { + active = 0; + lru = LRU_INACTIVE_ANON; + } + update_page_reclaim_stat(zone, page_tail, file, active); + if (likely(PageLRU(page))) + head = page->lru.prev; + else + head = &zone->lru[lru].list; + __add_page_to_lru_list(zone, page_tail, lru, head); + } else { + SetPageUnevictable(page_tail); + add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + } +} + +static void ____pagevec_lru_add_fn(struct page *page, void *arg) +{ + enum lru_list lru = (enum lru_list)arg; + struct zone *zone = page_zone(page); + int file = is_file_lru(lru); + int active = is_active_lru(lru); + + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON(PageLRU(page)); + + SetPageLRU(page); + if (active) + SetPageActive(page); + update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(zone, page, lru); +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { - int i; - struct zone *zone = NULL; - VM_BUG_ON(is_unevictable_lru(lru)); - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - int file; - int active; - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(PageUnevictable(page)); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - active = is_active_lru(lru); - file = is_file_lru(lru); - if (active) - SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); - add_page_to_lru_list(zone, page, lru); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); + pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); } EXPORT_SYMBOL(____pagevec_lru_add); diff --git a/mm/swap_state.c b/mm/swap_state.c index e10f583..5c8cfab 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -157,6 +157,12 @@ int add_to_swap(struct page *page) if (!entry.val) return 0; + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) { + swapcache_free(entry, NULL); + return 0; + } + /* * Radix-tree node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. 
__GFP_NOMEMALLOC diff --git a/mm/swapfile.c b/mm/swapfile.c index b6adcfb..07a458d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (unlikely(pmd_trans_huge(*pmd))) + continue; if (pmd_none_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, entry, page); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index eb5cc7d..cac13b4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -748,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask); - if (unlikely(IS_ERR(va))) { + if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); } @@ -1315,13 +1315,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, -1, GFP_KERNEL, caller); } -struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, - int node, gfp_t gfp_mask) -{ - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - node, gfp_mask, __builtin_return_address(0)); -} - static struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; @@ -1537,25 +1530,12 @@ fail: return NULL; } -void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) -{ - void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, - __builtin_return_address(0)); - - /* - * A ref_count = 3 is needed because the vm_struct and vmap_area - * structures allocated in the __get_vm_area_node() function contain - * references to the virtual address of the vmalloc'ed block. - */ - kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); - - return addr; -} - /** - * __vmalloc_node - allocate virtually contiguous memory + * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment + * @start: vm area range start + * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1565,9 +1545,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. 
*/ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller) { struct vm_struct *area; void *addr; @@ -1577,8 +1557,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, - VMALLOC_END, node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, + gfp_mask, caller); if (!area) return NULL; @@ -1595,6 +1575,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, return addr; } +/** + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or -1 + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. + */ +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) +{ + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp_mask, prot, node, caller); +} + void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { return __vmalloc_node(size, 1, gfp_mask, prot, -1, @@ -2203,17 +2204,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this - * @gfp_mask: allocation mask * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates - * congruent vmalloc areas for it. These areas tend to be scattered - * pretty far, distance between two areas easily going up to - * gigabytes. To avoid interacting with regular vmallocs, these areas - * are allocated from top. + * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to + * be scattered pretty far, distance between two areas easily going up + * to gigabytes. To avoid interacting with regular vmallocs, these + * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. 
It * does everything top-down and scans areas from the end looking for @@ -2224,7 +2224,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask) + size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); @@ -2234,8 +2234,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, unsigned long base, start, end, last_end; bool purged = false; - gfp_mask &= GFP_RECLAIM_MASK; - /* verify parameters and allocate data structures */ BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { @@ -2268,14 +2266,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } - vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); - vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); + vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) goto err_free; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); - vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } @@ -2456,13 +2454,8 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, "0x%p-0x%p %7ld", v->addr, v->addr + v->size, v->size); - if (v->caller) { - char buff[KSYM_SYMBOL_LEN]; - - seq_putc(m, ' '); - sprint_symbol(buff, (unsigned long)v->caller); - seq_puts(m, buff); - } + if (v->caller) + seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ca587c..99999a9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -32,6 +32,7 @@ #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> +#include <linux/compaction.h> #include <linux/notifier.h> #include <linux/rwsem.h> #include <linux/delay.h> @@ -40,6 +41,7 @@ #include <linux/memcontrol.h> #include <linux/delayacct.h> #include <linux/sysctl.h> +#include <linux/compaction.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -51,11 +53,23 @@ #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> -enum lumpy_mode { - LUMPY_MODE_NONE, - LUMPY_MODE_ASYNC, - LUMPY_MODE_SYNC, -}; +/* + * reclaim_mode determines how the inactive list is shrunk + * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages + * RECLAIM_MODE_ASYNC: Do not block + * RECLAIM_MODE_SYNC: Allow blocking e.g. 
call wait_on_page_writeback + * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference + * page from the LRU and reclaim all pages within a + * naturally aligned range + * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of + * order-0 pages and then compact the zone + */ +typedef unsigned __bitwise__ reclaim_mode_t; +#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) +#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) +#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) +#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) +#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) struct scan_control { /* Incremented by the number of inactive pages that were scanned */ @@ -88,7 +102,7 @@ struct scan_control { * Intend to reclaim enough continuous memory rather than reclaim * enough amount of memory. i.e, mode for high order allocation. */ - enum lumpy_mode lumpy_reclaim_mode; + reclaim_mode_t reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, +static void set_reclaim_mode(int priority, struct scan_control *sc, bool sync) { - enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; /* - * Some reclaim have alredy been failed. No worth to try synchronous - * lumpy reclaim. + * Initially assume we are entering either lumpy reclaim or + * reclaim/compaction.Depending on the order, we will either set the + * sync mode or just reclaim order-0 pages later. */ - if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) - return; + if (COMPACTION_BUILD) + sc->reclaim_mode = RECLAIM_MODE_COMPACTION; + else + sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. + * Avoid using lumpy reclaim or reclaim/compaction if possible by + * restricting when its set to either costly allocations or when + * under memory pressure */ if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode = mode; + sc->reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode = mode; + sc->reclaim_mode |= syncmode; else - sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; } -static void disable_lumpy_reclaim_mode(struct scan_control *sc) +static void reset_reclaim_mode(struct scan_control *sc) { - sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; } static inline int is_page_cache_freeable(struct page *page) @@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * first attempt to free a range of pages fails. 
*/ if (PageWriteback(page) && - sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) + (sc->reclaim_mode & RECLAIM_MODE_SYNC)) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); + trace_reclaim_flags(page, sc->reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -622,7 +639,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) + if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) return PAGEREF_RECLAIM; /* @@ -739,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && + if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && may_enter_fs) wait_on_page_writeback(page); else { @@ -895,7 +912,7 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); - disable_lumpy_reclaim_mode(sc); + reset_reclaim_mode(sc); continue; activate_locked: @@ -908,7 +925,7 @@ activate_locked: keep_locked: unlock_page(page); keep: - disable_lumpy_reclaim_mode(sc); + reset_reclaim_mode(sc); keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); @@ -1028,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: @@ -1086,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); - nr_taken++; + nr_taken += hpage_nr_pages(page); nr_lumpy_taken++; if (PageDirty(cursor_page)) nr_lumpy_dirty++; @@ -1141,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); - nr_active++; + nr_active += numpages; } if (count) - count[lru]++; + count[lru] += numpages; } return nr_active; @@ -1253,13 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, spin_lock_irq(&zone->lru_lock); continue; } - SetPageLRU(page); lru = page_lru(page); - add_page_to_lru_list(zone, page, lru); if (is_active_lru(lru)) { int file = is_file_lru(lru); - reclaim_stat->recent_rotated[file]++; + int numpages = hpage_nr_pages(page); + reclaim_stat->recent_rotated[file] += numpages; + if (putback_active_lru_page(zone, page)) + continue; } + SetPageLRU(page); + add_page_to_lru_list(zone, page, lru); if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); @@ -1324,7 +1345,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1368,15 +1389,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return SWAP_CLUSTER_MAX; } - set_lumpy_reclaim_mode(priority, sc, 
false); + set_reclaim_mode(priority, sc, false); lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1388,8 +1409,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); /* @@ -1411,7 +1432,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - set_lumpy_reclaim_mode(priority, sc, true); + set_reclaim_mode(priority, sc, true); nr_reclaimed += shrink_page_list(&page_list, zone, sc); } @@ -1426,7 +1447,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, zone_idx(zone), nr_scanned, nr_reclaimed, priority, - trace_shrink_flags(file, sc->lumpy_reclaim_mode)); + trace_shrink_flags(file, sc->reclaim_mode)); return nr_reclaimed; } @@ -1466,7 +1487,7 @@ static void move_active_pages_to_lru(struct zone *zone, list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); - pgmoved++; + pgmoved += hpage_nr_pages(page); if (!pagevec_add(&pvec, page) || list_empty(list)) { spin_unlock_irq(&zone->lru_lock); @@ -1534,7 +1555,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { - nr_rotated++; + nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -1805,6 +1826,57 @@ out: } /* + * Reclaim/compaction depends on a number of pages being freed. To avoid + * disruption to the system, a small number of order-0 pages continue to be + * rotated and reclaimed in the normal fashion. However, by the time we get + * back to the allocator and call try_to_compact_zone(), we ensure that + * there are enough free pages for it to be likely successful + */ +static inline bool should_continue_reclaim(struct zone *zone, + unsigned long nr_reclaimed, + unsigned long nr_scanned, + struct scan_control *sc) +{ + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + + /* If not in reclaim/compaction mode, stop */ + if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) + return false; + + /* + * If we failed to reclaim and have scanned the full list, stop. + * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far + * faster but obviously would be less likely to succeed + * allocation. 
If this is desirable, use GFP_REPEAT to decide + * if both reclaimed and scanned should be checked or just + * reclaimed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; + + /* If compaction would go ahead or the allocation would succeed, stop */ + switch (compaction_suitable(zone, sc->order)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + return true; + } +} + +/* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static void shrink_zone(int priority, struct zone *zone, @@ -1813,9 +1885,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long nr_reclaimed; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + unsigned long nr_scanned = sc->nr_scanned; +restart: + nr_reclaimed = 0; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1841,8 +1916,7 @@ static void shrink_zone(int priority, struct zone *zone, if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } - - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to @@ -1851,6 +1925,11 @@ static void shrink_zone(int priority, struct zone *zone, if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + /* reclaim/compaction might need reclaim to continue */ + if (should_continue_reclaim(zone, nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)) + goto restart; + throttle_vm_writeout(sc->gfp_mask); } @@ -2124,38 +2203,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +/* + * pgdat_balanced is used when checking if a node is balanced for high-order + * allocations. Only zones that meet watermarks and are in a zone allowed + * by the callers classzone_idx are added to balanced_pages. The total of + * balanced pages must be at least 25% of the zones allowed by classzone_idx + * for the node to be considered balanced. Forcing all zones to be balanced + * for high orders can cause excessive reclaim when there are imbalanced zones. + * The choice of 25% is due to + * o a 16M DMA zone that is balanced will not balance a zone on any + * reasonable sized machine + * o On all other machines, the top zone must be at least a reasonable + * precentage of the middle zones. For example, on 32-bit x86, highmem + * would need to be at least 256M for it to be balance a whole node. + * Similarly, on x86-64 the Normal zone would need to be at least 1G + * to balance a node on its own. These seemed like reasonable ratios. + */ +static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, + int classzone_idx) +{ + unsigned long present_pages = 0; + int i; + + for (i = 0; i <= classzone_idx; i++) + present_pages += pgdat->node_zones[i].present_pages; + + return balanced_pages > (present_pages >> 2); +} + /* is kswapd sleeping prematurely? 
*/ -static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, + int classzone_idx) { int i; + unsigned long balanced = 0; + bool all_zones_ok = true; /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) - return 1; + return true; - /* If after HZ/10, a zone is below the high mark, it's premature */ + /* Check the watermark levels */ for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable) + /* + * balance_pgdat() skips over all_unreclaimable after + * DEF_PRIORITY. Effectively, it considers them balanced so + * they must be considered balanced here as well if kswapd + * is to sleep + */ + if (zone->all_unreclaimable) { + balanced += zone->present_pages; continue; + } - if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), - 0, 0)) - return 1; + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), + classzone_idx, 0)) + all_zones_ok = false; + else + balanced += zone->present_pages; } - return 0; + /* + * For high-order requests, the balanced zones must contain at least + * 25% of the nodes pages for kswapd to sleep. For order-0, all zones + * must be balanced + */ + if (order) + return pgdat_balanced(pgdat, balanced, classzone_idx); + else + return !all_zones_ok; } /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the number of pages which were actually freed. + * Returns the final order kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -2172,11 +2300,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order, + int *classzone_idx) { int all_zones_ok; + unsigned long balanced; int priority; int i; + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { @@ -2199,7 +2330,6 @@ loop_again: count_vm_event(PAGEOUTRUN); for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; @@ -2208,6 +2338,7 @@ loop_again: disable_swap_token(); all_zones_ok = 1; + balanced = 0; /* * Scan in the highmem->dma direction for the highest @@ -2230,9 +2361,10 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; + *classzone_idx = i; break; } } @@ -2255,6 +2387,7 @@ loop_again: * cause too much scanning of the lower zones. */ for (i = 0; i <= end_zone; i++) { + int compaction; struct zone *zone = pgdat->node_zones + i; int nr_slab; @@ -2276,7 +2409,7 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. 
*/ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; @@ -2284,9 +2417,26 @@ loop_again: lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; + + compaction = 0; + if (order && + zone_watermark_ok(zone, 0, + high_wmark_pages(zone), + end_zone, 0) && + !zone_watermark_ok(zone, order, + high_wmark_pages(zone), + end_zone, 0)) { + compact_zone_order(zone, + order, + sc.gfp_mask, false, + COMPACT_MODE_KSWAPD); + compaction = 1; + } + if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && !zone_reclaimable(zone)) + if (!compaction && nr_slab == 0 && + !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -2297,7 +2447,7 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; /* @@ -2305,7 +2455,7 @@ loop_again: * means that we have a GFP_ATOMIC allocation * failure risk. Hurry up! */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; } else { @@ -2317,10 +2467,12 @@ loop_again: * spectulatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); + if (i <= *classzone_idx) + balanced += zone->present_pages; } } - if (all_zones_ok) + if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2343,7 +2495,13 @@ loop_again: break; } out: - if (!all_zones_ok) { + + /* + * order-0: All zones must meet high watermark for a balanced node + * high-order: Balanced zones must make up at least 25% of the node + * for the node to be balanced + */ + if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { cond_resched(); try_to_freeze(); @@ -2368,7 +2526,88 @@ out: goto loop_again; } - return sc.nr_reclaimed; + /* + * If kswapd was reclaiming at a higher order, it has the option of + * sleeping without all zones being balanced. Before it does, it must + * ensure that the watermarks for order-0 on *all* zones are met and + * that the congestion flags are cleared. The congestion flag must + * be cleared as kswapd is the only mechanism that clears the flag + * and it is potentially going to sleep here. + */ + if (order) { + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; + + /* Confirm the zone is balanced for order-0 */ + if (!zone_watermark_ok(zone, 0, + high_wmark_pages(zone), 0, 0)) { + order = sc.order = 0; + goto loop_again; + } + + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); + } + } + + /* + * Return the order we were reclaiming at so sleeping_prematurely() + * makes a decision on the order we were last reclaiming at. 
However, + * if another caller entered the allocator slow path while kswapd + * was awake, order will remain at the higher level + */ + *classzone_idx = end_zone; + return order; +} + +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +{ + long remaining = 0; + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; + + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a premature sleep. If not, then + * go fully to sleep until explicitly woken up. + */ + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + + /* + * vmstat counters are not perfectly accurate and the estimated + * value for counters such as NR_FREE_PAGES can deviate from the + * true value by nr_online_cpus * threshold. To avoid the zone + * watermarks being breached while under pressure, we reduce the + * per-cpu vmstat threshold while kswapd is awake and restore + * them before going back to sleep. + */ + set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + schedule(); + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); + } else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + finish_wait(&pgdat->kswapd_wait, &wait); } /* @@ -2387,9 +2626,10 @@ out: static int kswapd(void *p) { unsigned long order; + int classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; - DEFINE_WAIT(wait); + struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; @@ -2417,49 +2657,30 @@ static int kswapd(void *p) set_freezable(); order = 0; + classzone_idx = MAX_NR_ZONES - 1; for ( ; ; ) { unsigned long new_order; + int new_classzone_idx; int ret; - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; - if (order < new_order) { + pgdat->classzone_idx = MAX_NR_ZONES - 1; + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' - * allocation + * allocation or has tigher zone constraints */ order = new_order; + classzone_idx = new_classzone_idx; } else { - if (!freezing(current) && !kthread_should_stop()) { - long remaining = 0; - - /* Try to sleep for a short interval */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - remaining = schedule_timeout(HZ/10); - finish_wait(&pgdat->kswapd_wait, &wait); - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - } - - /* - * After a short sleep, check if it was a - * premature sleep. 
If not, then go fully - * to sleep until explicitly woken up - */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - trace_mm_vmscan_kswapd_sleep(pgdat->node_id); - schedule(); - } else { - if (remaining) - count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); - else - count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); - } - } - + kswapd_try_to_sleep(pgdat, order, classzone_idx); order = pgdat->kswapd_max_order; + classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = MAX_NR_ZONES - 1; } - finish_wait(&pgdat->kswapd_wait, &wait); ret = try_to_freeze(); if (kthread_should_stop()) @@ -2471,7 +2692,7 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balance_pgdat(pgdat, order); + order = balance_pgdat(pgdat, order, &classzone_idx); } } return 0; @@ -2480,23 +2701,26 @@ static int kswapd(void *p) /* * A zone is low on free memory, so wake its kswapd task to service it. */ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; if (!populated_zone(zone)) return; - pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) - return; - if (pgdat->kswapd_max_order < order) - pgdat->kswapd_max_order = order; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; + pgdat = zone->zone_pgdat; + if (pgdat->kswapd_max_order < order) { + pgdat->kswapd_max_order = order; + pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); + } if (!waitqueue_active(&pgdat->kswapd_wait)) return; + if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + return; + + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 312d728..0c3b504 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -static int calculate_threshold(struct zone *zone) +int calculate_pressure_threshold(struct zone *zone) +{ + int threshold; + int watermark_distance; + + /* + * As vmstats are not up to date, there is drift between the estimated + * and real values. For high thresholds and a high number of CPUs, it + * is possible for the min watermark to be breached while the estimated + * value looks fine. 
The pressure threshold is a reduced value such + * that even the maximum amount of drift will not accidentally breach + * the min watermark + */ + watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); + threshold = max(1, (int)(watermark_distance / num_online_cpus())); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} + +int calculate_normal_threshold(struct zone *zone) { int threshold; int mem; /* memory in 128 MB units */ @@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void) for_each_populated_zone(zone) { unsigned long max_drift, tolerate_drift; - threshold = calculate_threshold(zone); + threshold = calculate_normal_threshold(zone); for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold @@ -161,6 +185,26 @@ static void refresh_zone_stat_thresholds(void) } } +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = (*calculate_pressure)(zone); + for_each_possible_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } +} + /* * For use when we know that interrupts are disabled. */ @@ -836,6 +880,7 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_anon_transparent_hugepages", "nr_dirty_threshold", "nr_dirty_background_threshold", @@ -911,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_nr_free_pages(zone), + zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7f68625..f29abeb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,8 +104,26 @@ static pfn_t fault_pfn; inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { - struct page *page = compound_head(pfn_to_page(pfn)); - return PageReserved(page); + int reserved; + struct page *tail = pfn_to_page(pfn); + struct page *head = compound_trans_head(tail); + reserved = PageReserved(head); + if (head != tail) { + /* + * "head" is not a dangling pointer + * (compound_trans_head takes care of that) + * but the hugepage may have been splitted + * from under us (and we may not hold a + * reference count on the head page so it can + * be reused before we run PageReferenced), so + * we've to check PageTail before returning + * what we just read. 
+ */ + smp_rmb(); + if (PageTail(tail)) + return reserved; + } + return PageReserved(tail); } return true; @@ -352,6 +370,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, return young; } +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + young = kvm_test_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + return young; +} + static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -368,6 +402,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .test_young = kvm_mmu_notifier_test_young, .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; |
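
The set_reclaim_mode() hunk in the mm/vmscan.c part of this series replaces the three-state lumpy_mode enum with a reclaim_mode_t bitmask, so that sync/async behaviour can be combined with either lumpy reclaim or reclaim/compaction. The following is a minimal userspace sketch of that flag logic only; PAGE_ALLOC_COSTLY_ORDER, DEF_PRIORITY and COMPACTION_BUILD are stand-in values chosen for illustration, not the kernel's definitions.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int reclaim_mode_t;

#define RECLAIM_MODE_SINGLE        0x01u
#define RECLAIM_MODE_ASYNC         0x02u
#define RECLAIM_MODE_SYNC          0x04u
#define RECLAIM_MODE_LUMPYRECLAIM  0x08u
#define RECLAIM_MODE_COMPACTION    0x10u

#define PAGE_ALLOC_COSTLY_ORDER 3   /* stand-in value */
#define DEF_PRIORITY            12  /* stand-in value */
#define COMPACTION_BUILD        1   /* pretend CONFIG_COMPACTION=y */

struct scan_control {
	int order;
	reclaim_mode_t reclaim_mode;
};

/* Mirrors set_reclaim_mode(): pick lumpy vs. compaction, then decide sync. */
static void set_reclaim_mode(int priority, struct scan_control *sc, bool sync)
{
	reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;

	sc->reclaim_mode = COMPACTION_BUILD ? RECLAIM_MODE_COMPACTION
					    : RECLAIM_MODE_LUMPYRECLAIM;

	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		sc->reclaim_mode |= syncmode;
	else if (sc->order && priority < DEF_PRIORITY - 2)
		sc->reclaim_mode |= syncmode;
	else
		sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}

int main(void)
{
	struct scan_control sc = { .order = 4 };

	/* A costly (order-4) allocation enters reclaim/compaction mode. */
	set_reclaim_mode(DEF_PRIORITY, &sc, false);
	printf("order-%d reclaim_mode: %#x (compaction=%d sync=%d)\n",
	       sc.order, sc.reclaim_mode,
	       !!(sc.reclaim_mode & RECLAIM_MODE_COMPACTION),
	       !!(sc.reclaim_mode & RECLAIM_MODE_SYNC));
	return 0;
}

Because the mode is now a bitmask, callers such as should_continue_reclaim() and should_reclaim_stall() can test individual bits (RECLAIM_MODE_COMPACTION, RECLAIM_MODE_SINGLE) instead of comparing against a single enum value.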
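
kswapd's high-order balancing in the same file relies on pgdat_balanced(), which declares a node balanced once the zones that already meet their watermarks hold more than 25% of the pages usable by the caller's classzone_idx. Here is a self-contained sketch of that check; the zone sizes in present_pages[] are made-up example values standing in for pgdat->node_zones[].

#include <stdbool.h>
#include <stdio.h>

#define MAX_ZONES 4

static unsigned long present_pages[MAX_ZONES] = {
	4096,      /* small DMA-like zone, ~16M with 4K pages */
	262144,    /* "Normal" */
	65536,     /* "HighMem" */
	0,
};

/* Balanced zones must cover more than a quarter of the allowed zones. */
static bool pgdat_balanced(unsigned long balanced_pages, int classzone_idx)
{
	unsigned long present = 0;
	int i;

	for (i = 0; i <= classzone_idx; i++)
		present += present_pages[i];

	return balanced_pages > (present >> 2);
}

int main(void)
{
	/* Only the small DMA-like zone met its high watermark: not enough. */
	printf("balanced? %s\n", pgdat_balanced(4096, 2) ? "yes" : "no");
	/* The Normal zone is balanced as well: the node counts as balanced. */
	printf("balanced? %s\n",
	       pgdat_balanced(4096 + 262144, 2) ? "yes" : "no");
	return 0;
}

This is why a balanced 16M DMA zone alone never lets kswapd sleep for a high-order request, while order-0 requests still require every zone to meet its watermark.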
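
Finally, kswapd_try_to_sleep() switches the per-CPU vmstat thresholds to calculate_pressure_threshold() before sleeping, so that per-CPU counter drift cannot make NR_FREE_PAGES look healthy while the min watermark is actually breached. The sketch below reproduces that arithmetic in isolation; the watermark values and CPU count passed in main() are illustrative only.

#include <stdio.h>

static int max_int(int a, int b) { return a > b ? a : b; }
static int min_int(int a, int b) { return a < b ? a : b; }

/*
 * Keep the worst-case drift (num_online_cpus * threshold) within the gap
 * between the low and min watermarks, capped at the normal maximum of 125.
 */
static int calculate_pressure_threshold(unsigned long low_wmark,
					unsigned long min_wmark,
					int num_online_cpus)
{
	int watermark_distance = (int)(low_wmark - min_wmark);
	int threshold = max_int(1, watermark_distance / num_online_cpus);

	return min_int(125, threshold);
}

int main(void)
{
	/* e.g. low=1280 pages, min=1024 pages, 16 CPUs -> threshold of 16 */
	printf("pressure threshold: %d pages\n",
	       calculate_pressure_threshold(1280, 1024, 16));
	return 0;
}

The normal (larger) threshold is restored as soon as kswapd goes back to sleep, so the cheaper batched counter updates are only given up while the node is under reclaim pressure.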