diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-07 18:39:37 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-07 18:39:37 -0700 |
commit | 68abbe729567cef128b2c2141f2ed2567f3b8372 (patch) | |
tree | aa75c39cc815eee4d7cc8db2988fe10879fccd3e | |
parent | ba1b7309fc2e909a5828c36a7cd187e5d7df6f53 (diff) | |
parent | 016e92da037e0b43dd5e5848c19b0b9749506963 (diff) | |
download | op-kernel-dev-68abbe729567cef128b2c2141f2ed2567f3b8372.zip op-kernel-dev-68abbe729567cef128b2c2141f2ed2567f3b8372.tar.gz |
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
- a few misc things
- ocfs2 updates
- v9fs updates
- MM
- procfs updates
- lib/ updates
- autofs updates
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
autofs: small cleanup in autofs_getpath()
autofs: clean up includes
autofs: comment on selinux changes needed for module autoload
autofs: update MAINTAINERS entry for autofs
autofs: use autofs instead of autofs4 in documentation
autofs: rename autofs documentation files
autofs: create autofs Kconfig and Makefile
autofs: delete fs/autofs4 source files
autofs: update fs/autofs4/Makefile
autofs: update fs/autofs4/Kconfig
autofs: copy autofs4 to autofs
autofs4: use autofs instead of autofs4 everywhere
autofs4: merge auto_fs.h and auto_fs4.h
fs/binfmt_misc.c: do not allow offset overflow
checkpatch: improve patch recognition
lib/ucs2_string.c: add MODULE_LICENSE()
lib/mpi: headers cleanup
lib/percpu_ida.c: use _irqsave() instead of local_irq_save() + spin_lock
lib/idr.c: remove simple_ida_lock
lib/bitmap.c: micro-optimization for __bitmap_complement()
...
147 files changed, 2945 insertions, 2066 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 74cdeae..8a2c52d 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1001,14 +1001,44 @@ PAGE_SIZE multiple when read back. The total amount of memory currently being used by the cgroup and its descendants. + memory.min + A read-write single value file which exists on non-root + cgroups. The default is "0". + + Hard memory protection. If the memory usage of a cgroup + is within its effective min boundary, the cgroup's memory + won't be reclaimed under any conditions. If there is no + unprotected reclaimable memory available, OOM killer + is invoked. + + Effective min boundary is limited by memory.min values of + all ancestor cgroups. If there is memory.min overcommitment + (child cgroup or cgroups are requiring more protected memory + than parent will allow), then each child cgroup will get + the part of parent's protection proportional to its + actual memory usage below memory.min. + + Putting more memory than generally available under this + protection is discouraged and may lead to constant OOMs. + + If a memory cgroup is not populated with processes, + its memory.min is ignored. + memory.low A read-write single value file which exists on non-root cgroups. The default is "0". - Best-effort memory protection. If the memory usages of a - cgroup and all its ancestors are below their low boundaries, - the cgroup's memory won't be reclaimed unless memory can be - reclaimed from unprotected cgroups. + Best-effort memory protection. If the memory usage of a + cgroup is within its effective low boundary, the cgroup's + memory won't be reclaimed unless memory can be reclaimed + from unprotected cgroups. + + Effective low boundary is limited by memory.low values of + all ancestor cgroups. 
If there is memory.low overcommitment + (child cgroup or cgroups are requiring more protected memory + than parent will allow), then each child cgroup will get + the part of parent's protection proportional to its + actual memory usage below memory.low. Putting more memory than generally available under this protection is discouraged. @@ -1199,6 +1229,27 @@ PAGE_SIZE multiple when read back. Swap usage hard limit. If a cgroup's swap usage reaches this limit, anonymous memory of the cgroup will not be swapped out. + memory.swap.events + A read-only flat-keyed file which exists on non-root cgroups. + The following entries are defined. Unless specified + otherwise, a value change in this file generates a file + modified event. + + max + The number of times the cgroup's swap usage was about + to go over the max boundary and swap allocation + failed. + + fail + The number of times swap allocation failed either + because of running out of swap system-wide or max + limit. + + When reduced under the current usage, the existing swap + entries are reclaimed gradually and the swap usage may stay + higher than the limit for an extended period of time. This + reduces the impact on the workload and memory management. + Usage Guidelines ~~~~~~~~~~~~~~~~ @@ -1934,17 +1985,8 @@ system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated -reserve. A cgroup enjoys reclaim protection when it and all its -ancestors are below their low boundaries, which makes delegation of -subtrees possible. Secondly, new cgroups have no reserve per default -and in the common case most cgroups are eligible for the preferred -reclaim pass. This allows the new low boundary to be efficiently -implemented with just a minor addition to the generic reclaim code, -without the need for out-of-band data structures and reclaim passes. 
-Because the generic reclaim code considers all cgroups except for the -ones running low in the preferred first reclaim pass, overreclaim of -individual groups is eliminated as well, resulting in much better -overall workload performance. +reserve. A cgroup enjoys reclaim protection when it's within its low, +which makes delegation of subtrees possible. The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 257e657..875b2b5 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -218,6 +218,7 @@ line of text and contains the following stats separated by whitespace: same_pages the number of same element filled pages written to this disk. No memory is allocated for such pages. pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages 9) Deactivate: swapoff /dev/zram0 @@ -242,5 +243,29 @@ to backing storage rather than keeping it in memory. User should set up backing device via /sys/block/zramX/backing_dev before disksize setting. += memory tracking + +With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the +zram block. It could be useful to catch cold or incompressible +pages of the process with*pagemap. +If you enable the feature, you could see block state via +/sys/kernel/debug/zram/zram0/block_state". The output is as follows, + + 300 75.033841 .wh + 301 63.806904 s.. + 302 63.806919 ..h + +First column is zram's block index. +Second column is access time since the system was booted +Third column is state of the block. +(s: same page +w: written page to backing store +h: huge page) + +First line of above example says 300th block is accessed at 75.033841sec +and the block's state is huge so it is written back to the backing +storage. It's a debugging feature so anyone shouldn't rely on it to work +properly. 
+ Nitin Gupta ngupta@vflare.org diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt index 6a608a6..a837842 100644 --- a/Documentation/features/vm/pte_special/arch-support.txt +++ b/Documentation/features/vm/pte_special/arch-support.txt @@ -1,6 +1,6 @@ # # Feature name: pte_special -# Kconfig: __HAVE_ARCH_PTE_SPECIAL +# Kconfig: ARCH_HAS_PTE_SPECIAL # description: arch supports the pte_special()/pte_mkspecial() VM APIs # ----------------------- diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index b7bd6c9..a8bd4af 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -10,8 +10,8 @@ afs.txt - info and examples for the distributed AFS (Andrew File System) fs. affs.txt - info and mount options for the Amiga Fast File System. -autofs4-mount-control.txt - - info on device control operations for autofs4 module. +autofs-mount-control.txt + - info on device control operations for autofs module. automount-support.txt - information about filesystem automount support. befs.txt diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs-mount-control.txt index e5177cb..45edad6 100644 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ b/Documentation/filesystems/autofs-mount-control.txt @@ -1,5 +1,5 @@ -Miscellaneous Device control operations for the autofs4 kernel module +Miscellaneous Device control operations for the autofs kernel module ==================================================================== The problem @@ -164,7 +164,7 @@ possibility for future development due to the requirements of the message bus architecture. -autofs4 Miscellaneous Device mount control interface +autofs Miscellaneous Device mount control interface ==================================================== The control interface is opening a device node, typically /dev/autofs. 
@@ -244,7 +244,7 @@ The device node ioctl operations implemented by this interface are: AUTOFS_DEV_IOCTL_VERSION ------------------------ -Get the major and minor version of the autofs4 device ioctl kernel module +Get the major and minor version of the autofs device ioctl kernel module implementation. It requires an initialized struct autofs_dev_ioctl as an input parameter and sets the version information in the passed in structure. It returns 0 on success or the error -EINVAL if a version mismatch is @@ -254,7 +254,7 @@ detected. AUTOFS_DEV_IOCTL_PROTOVER_CMD and AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD ------------------------------------------------------------------ -Get the major and minor version of the autofs4 protocol version understood +Get the major and minor version of the autofs protocol version understood by loaded module. This call requires an initialized struct autofs_dev_ioctl with the ioctlfd field set to a valid autofs mount point descriptor and sets the requested version number in version field of struct args_protover @@ -404,4 +404,3 @@ type is also given we are looking for a particular autofs mount and if a match isn't found a fail is returned. If the the located path is the root of a mount 1 is returned along with the super magic of the mount or 0 otherwise. - diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs.txt index f10dd59..373ad25 100644 --- a/Documentation/filesystems/autofs4.txt +++ b/Documentation/filesystems/autofs.txt @@ -30,15 +30,15 @@ key advantages: Context ------- -The "autofs4" filesystem module is only one part of an autofs system. +The "autofs" filesystem module is only one part of an autofs system. There also needs to be a user-space program which looks up names and mounts filesystems. This will often be the "automount" program, -though other tools including "systemd" can make use of "autofs4". +though other tools including "systemd" can make use of "autofs". 
This document describes only the kernel module and the interactions required with any user-space program. Subsequent text refers to this as the "automount daemon" or simply "the daemon". -"autofs4" is a Linux kernel module with provides the "autofs" +"autofs" is a Linux kernel module with provides the "autofs" filesystem type. Several "autofs" filesystems can be mounted and they can each be managed separately, or all managed by the same daemon. @@ -215,7 +215,7 @@ of expiry. The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to the `umount` system call. Unmounting with MNT_EXPIRE will fail unless a previous attempt had been made, and the filesystem has been inactive -and untouched since that previous attempt. autofs4 does not depend on +and untouched since that previous attempt. autofs does not depend on this but has its own internal tracking of whether filesystems were recently used. This allows individual names in the autofs directory to expire separately. @@ -415,7 +415,7 @@ which can be used to communicate directly with the autofs filesystem. It requires CAP_SYS_ADMIN for access. The `ioctl`s that can be used on this device are described in a separate -document `autofs4-mount-control.txt`, and are summarized briefly here. +document `autofs-mount-control.txt`, and are summarized briefly here. Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure: struct autofs_dev_ioctl { diff --git a/Documentation/filesystems/automount-support.txt b/Documentation/filesystems/automount-support.txt index 7eb762e..b0afd3d 100644 --- a/Documentation/filesystems/automount-support.txt +++ b/Documentation/filesystems/automount-support.txt @@ -9,7 +9,7 @@ also be requested by userspace. 
IN-KERNEL AUTOMOUNTING ====================== -See section "Mount Traps" of Documentation/filesystems/autofs4.txt +See section "Mount Traps" of Documentation/filesystems/autofs.txt Then from userspace, you can just do something like: diff --git a/Documentation/filesystems/path-lookup.md b/Documentation/filesystems/path-lookup.md index 1933ef7..e2edd45 100644 --- a/Documentation/filesystems/path-lookup.md +++ b/Documentation/filesystems/path-lookup.md @@ -460,7 +460,7 @@ this retry process in the next article. Automount points are locations in the filesystem where an attempt to lookup a name can trigger changes to how that lookup should be handled, in particular by mounting a filesystem there. These are -covered in greater detail in autofs4.txt in the Linux documentation +covered in greater detail in autofs.txt in the Linux documentation tree, but a few notes specifically related to path lookup are in order here. diff --git a/MAINTAINERS b/MAINTAINERS index d325d2d..c9ac159 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7723,11 +7723,11 @@ W: https://linuxtv.org S: Maintained F: drivers/media/radio/radio-keene* -KERNEL AUTOMOUNTER v4 (AUTOFS4) +KERNEL AUTOMOUNTER M: Ian Kent <raven@themaw.net> L: autofs@vger.kernel.org S: Maintained -F: fs/autofs4/ +F: fs/autofs/ KERNEL BUILD + files below scripts/ (unless maintained elsewhere) M: Masahiro Yamada <yamada.masahiro@socionext.com> diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 89d47ea..e81bcd2 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -48,6 +48,7 @@ config ARC select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA + select ARCH_HAS_PTE_SPECIAL config MIGHT_HAVE_PCI bool diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 08fe338..8ec5599 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -320,8 +320,6 @@ PTE_BIT_FUNC(mkexec, |= (_PAGE_EXECUTE)); PTE_BIT_FUNC(mkspecial, |= (_PAGE_SPECIAL)); 
PTE_BIT_FUNC(mkhuge, |= (_PAGE_HW_SZ)); -#define __HAVE_ARCH_PTE_SPECIAL - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8f460bd..534563a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -8,6 +8,7 @@ config ARM select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_PTE_SPECIAL if ARM_LPAE select ARCH_HAS_SET_MEMORY select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 2a48360..6d50a11 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -219,7 +219,6 @@ static inline pte_t pte_mkspecial(pte_t pte) pte_val(pte) |= L_PTE_SPECIAL; return pte; } -#define __HAVE_ARCH_PTE_SPECIAL #define pmd_write(pmd) (pmd_isclear((pmd), L_PMD_SECT_RDONLY)) #define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY)) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b25ed78..4759566 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -17,6 +17,7 @@ config ARM64 select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7c4c8f3..9f82d6b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -306,8 +306,6 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -#define __HAVE_ARCH_PTE_SPECIAL - static inline pte_t pgd_pte(pgd_t pgd) { return __pte(pgd_val(pgd)); diff --git a/arch/powerpc/Kconfig 
b/arch/powerpc/Kconfig index 076fe309..8f959df 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -135,6 +135,7 @@ config PPC select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 42fe7c2..63cee15 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -335,9 +335,6 @@ extern unsigned long pci_io_base; /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #ifndef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 050b0d7..bef5614 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -208,9 +208,6 @@ static inline bool pte_user(pte_t pte) #define PAGE_AGP (PAGE_KERNEL_NC) #define HAVE_PAGE_AGP -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #ifndef _PAGE_READ /* if not defined, we should not find _PAGE_WRITE too */ #define _PAGE_READ 0 diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 274bc06..17f19e6 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -42,6 +42,7 @@ config RISCV select THREAD_INFO_IN_TASK select RISCV_TIMER select GENERIC_IRQ_MULTI_HANDLER + select ARCH_HAS_PTE_SPECIAL config MMU def_bool y diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index 997ddbb..2fa2942 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -42,7 +42,4 @@ _PAGE_WRITE | _PAGE_EXEC | \ _PAGE_USER | _PAGE_GLOBAL)) -/* Advertise support for _PAGE_SPECIAL */ -#define 
__HAVE_ARCH_PTE_SPECIAL - #endif /* _ASM_RISCV_PGTABLE_BITS_H */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b7deee7..baed397 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -65,6 +65,7 @@ config S390 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2d24d33..9809694 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -171,7 +171,6 @@ static inline int is_module_addr(void *addr) #define _PAGE_WRITE 0x020 /* SW pte write bit */ #define _PAGE_SPECIAL 0x040 /* SW associated with special page */ #define _PAGE_UNUSED 0x080 /* SW bit for pgste usage state */ -#define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY 0x002 /* SW pte soft dirty bit */ diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 562f729..84bd632 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -190,14 +190,15 @@ unsigned long *page_table_alloc(struct mm_struct *mm) if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, struct page, lru); - mask = atomic_read(&page->_mapcount); + mask = atomic_read(&page->_refcount) >> 24; mask = (mask | (mask >> 4)) & 3; if (mask != 3) { table = (unsigned long *) page_to_phys(page); bit = mask & 1; /* =1 -> second 2K */ if (bit) table += PTRS_PER_PTE; - atomic_xor_bits(&page->_mapcount, 1U << bit); + atomic_xor_bits(&page->_refcount, + 1U << (bit + 24)); list_del(&page->lru); } } @@ -218,12 +219,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = (unsigned long *) page_to_phys(page); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ - atomic_set(&page->_mapcount, 3); + atomic_xor_bits(&page->_refcount, 3 
<< 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } else { /* Return the first 2K fragment of the page */ - atomic_set(&page->_mapcount, 1); + atomic_xor_bits(&page->_refcount, 1 << 24); memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); spin_lock_bh(&mm->context.lock); list_add(&page->lru, &mm->context.pgtable_list); @@ -242,7 +243,8 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) /* Free 2K page table fragment of a 4K page */ bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_mapcount, 1U << bit); + mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24)); + mask >>= 24; if (mask & 3) list_add(&page->lru, &mm->context.pgtable_list); else @@ -253,7 +255,6 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) } pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); __free_page(page); } @@ -274,7 +275,8 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, } bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); + mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask >>= 24; if (mask & 3) list_add_tail(&page->lru, &mm->context.pgtable_list); else @@ -296,12 +298,13 @@ static void __tlb_remove_table(void *_table) break; case 1: /* lower 2K of a 4K page table */ case 2: /* higher 2K of a 4K page table */ - if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0) + mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); + mask >>= 24; + if (mask != 0) break; /* fallthrough */ case 3: /* 4K page table with pgstes */ pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); __free_page(page); break; } diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index ae619d5..4d61a08 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -1,6 +1,7 @@ # 
SPDX-License-Identifier: GPL-2.0 config SUPERH def_bool y + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_COHERENT_DMA_MMAP if !MMU diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 89c513a..f6abfe2 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -156,8 +156,6 @@ extern void page_table_range_init(unsigned long start, unsigned long end, #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define __HAVE_ARCH_PTE_SPECIAL - #include <asm-generic/pgtable.h> #endif /* __ASM_SH_PGTABLE_H */ diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index b42ba88..9a2b887 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -88,6 +88,7 @@ config SPARC64 select ARCH_USE_QUEUED_SPINLOCKS select GENERIC_TIME_VSYSCALL select ARCH_CLOCKSOURCE_DATA + select ARCH_HAS_PTE_SPECIAL config ARCH_DEFCONFIG string diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 44d6ac4..1393a8a 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -117,9 +117,6 @@ bool kern_addr_valid(unsigned long addr); #define _PAGE_PMD_HUGE _AC(0x0100000000000000,UL) /* Huge page */ #define _PAGE_PUD_HUGE _PAGE_PMD_HUGE -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - /* SUN4U pte bits... 
*/ #define _PAGE_SZ4MB_4U _AC(0x6000000000000000,UL) /* 4MB Page */ #define _PAGE_SZ512K_4U _AC(0x4000000000000000,UL) /* 512K Page */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb6e3a2..f182a4e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_UACCESS_MCSAFE if X86_64 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 1e5a406..99fff85 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -65,7 +65,6 @@ #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0)) #endif -#define __HAVE_ARCH_PTE_SPECIAL #define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \ _PAGE_PKEY_BIT1 | \ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ffc8c13..938dbcd 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -114,13 +114,12 @@ static inline void pgd_list_del(pgd_t *pgd) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { - BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); - virt_to_page(pgd)->index = (pgoff_t)mm; + virt_to_page(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { - return (struct mm_struct *)page->index; + return page->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index ac3a31d..6352357 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -13,7 +13,7 @@ config ZRAM It has several use cases, for example: /tmp storage, use as swap disks and maybe many more. - See zram.txt for more information. + See Documentation/blockdev/zram.txt for more information. 
config ZRAM_WRITEBACK bool "Write back incompressible page to backing device" @@ -25,4 +25,14 @@ config ZRAM_WRITEBACK For this feature, admin should set up backing device via /sys/block/zramX/backing_dev. - See zram.txt for more infomration. + See Documentation/blockdev/zram.txt for more information. + +config ZRAM_MEMORY_TRACKING + bool "Track zRam block status" + depends on ZRAM && DEBUG_FS + help + With this feature, admin can track the state of allocated blocks + of zRAM. Admin could see the information via + /sys/kernel/debug/zram/zramX/block_state. + + See Documentation/blockdev/zram.txt for more information. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0f3fadd..da51293 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -31,6 +31,7 @@ #include <linux/err.h> #include <linux/idr.h> #include <linux/sysfs.h> +#include <linux/debugfs.h> #include <linux/cpuhotplug.h> #include "zram_drv.h" @@ -52,11 +53,28 @@ static size_t huge_class_size; static void zram_free_page(struct zram *zram, size_t index); +static void zram_slot_lock(struct zram *zram, u32 index) +{ + bit_spin_lock(ZRAM_LOCK, &zram->table[index].value); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].value); +} + static inline bool init_done(struct zram *zram) { return zram->disksize; } +static inline bool zram_allocated(struct zram *zram, u32 index) +{ + + return (zram->table[index].value >> (ZRAM_FLAG_SHIFT + 1)) || + zram->table[index].handle; +} + static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -73,7 +91,7 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) } /* flag operations require table entry bit_spin_lock() being held */ -static int zram_test_flag(struct zram *zram, u32 index, +static bool zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { 
return zram->table[index].value & BIT(flag); @@ -600,6 +618,114 @@ static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, static void zram_wb_clear(struct zram *zram, u32 index) {} #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + +static struct dentry *zram_debugfs_root; + +static void zram_debugfs_create(void) +{ + zram_debugfs_root = debugfs_create_dir("zram", NULL); +} + +static void zram_debugfs_destroy(void) +{ + debugfs_remove_recursive(zram_debugfs_root); +} + +static void zram_accessed(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = ktime_get_boottime(); +} + +static void zram_reset_access(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = 0; +} + +static ssize_t read_block_state(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t index, written = 0; + struct zram *zram = file->private_data; + unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + struct timespec64 ts; + + kbuf = kvmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + kvfree(kbuf); + return -EINVAL; + } + + for (index = *ppos; index < nr_pages; index++) { + int copied; + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + ts = ktime_to_timespec64(zram->table[index].ac_time); + copied = snprintf(kbuf + written, count, + "%12zd %12lld.%06lu %c%c%c\n", + index, (s64)ts.tv_sec, + ts.tv_nsec / NSEC_PER_USEC, + zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.', + zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.', + zram_test_flag(zram, index, ZRAM_HUGE) ? 
'h' : '.'); + + if (count < copied) { + zram_slot_unlock(zram, index); + break; + } + written += copied; + count -= copied; +next: + zram_slot_unlock(zram, index); + *ppos += 1; + } + + up_read(&zram->init_lock); + if (copy_to_user(buf, kbuf, written)) + written = -EFAULT; + kvfree(kbuf); + + return written; +} + +static const struct file_operations proc_zram_block_state_op = { + .open = simple_open, + .read = read_block_state, + .llseek = default_llseek, +}; + +static void zram_debugfs_register(struct zram *zram) +{ + if (!zram_debugfs_root) + return; + + zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name, + zram_debugfs_root); + debugfs_create_file("block_state", 0400, zram->debugfs_dir, + zram, &proc_zram_block_state_op); +} + +static void zram_debugfs_unregister(struct zram *zram) +{ + debugfs_remove_recursive(zram->debugfs_dir); +} +#else +static void zram_debugfs_create(void) {}; +static void zram_debugfs_destroy(void) {}; +static void zram_accessed(struct zram *zram, u32 index) {}; +static void zram_reset_access(struct zram *zram, u32 index) {}; +static void zram_debugfs_register(struct zram *zram) {}; +static void zram_debugfs_unregister(struct zram *zram) {}; +#endif /* * We switched to per-cpu streams and this attr is not needed anymore. 
@@ -719,14 +845,15 @@ static ssize_t mm_stat_show(struct device *dev, max_used = atomic_long_read(&zram->stats.max_used_pages); ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n", + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n", orig_size << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.compr_data_size), mem_used << PAGE_SHIFT, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.same_pages), - pool_stats.pages_compacted); + pool_stats.pages_compacted, + (u64)atomic64_read(&zram->stats.huge_pages)); up_read(&zram->init_lock); return ret; @@ -753,16 +880,6 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); -static void zram_slot_lock(struct zram *zram, u32 index) -{ - bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value); -} - -static void zram_slot_unlock(struct zram *zram, u32 index) -{ - bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); -} - static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -805,6 +922,13 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; + zram_reset_access(zram, index); + + if (zram_test_flag(zram, index, ZRAM_HUGE)) { + zram_clear_flag(zram, index, ZRAM_HUGE); + atomic64_dec(&zram->stats.huge_pages); + } + if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) { zram_wb_clear(zram, index); atomic64_dec(&zram->stats.pages_stored); @@ -973,6 +1097,7 @@ compress_again: } if (unlikely(comp_len >= huge_class_size)) { + comp_len = PAGE_SIZE; if (zram_wb_enabled(zram) && allow_wb) { zcomp_stream_put(zram->comp); ret = write_to_bdev(zram, bvec, index, bio, &element); @@ -984,7 +1109,6 @@ compress_again: allow_wb = false; goto compress_again; } - comp_len = PAGE_SIZE; } /* @@ -1046,6 +1170,11 @@ out: zram_slot_lock(zram, index); zram_free_page(zram, index); + if (comp_len == PAGE_SIZE) { + zram_set_flag(zram, index, ZRAM_HUGE); + 
atomic64_inc(&zram->stats.huge_pages); + } + if (flags) { zram_set_flag(zram, index, flags); zram_set_element(zram, index, element); @@ -1166,6 +1295,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time); + zram_slot_lock(zram, index); + zram_accessed(zram, index); + zram_slot_unlock(zram, index); + if (unlikely(ret < 0)) { if (!is_write) atomic64_inc(&zram->stats.failed_reads); @@ -1577,6 +1710,7 @@ static int zram_add(void) } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; @@ -1610,6 +1744,7 @@ static int zram_remove(struct zram *zram) zram->claim = true; mutex_unlock(&bdev->bd_mutex); + zram_debugfs_unregister(zram); /* * Remove sysfs first, so no one will perform a disksize * store while we destroy the devices. This also helps during @@ -1712,6 +1847,7 @@ static void destroy_devices(void) { class_unregister(&zram_control_class); idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + zram_debugfs_destroy(); idr_destroy(&zram_index_idr); unregister_blkdev(zram_major, "zram"); cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE); @@ -1733,6 +1869,7 @@ static int __init zram_init(void) return ret; } + zram_debugfs_create(); zram_major = register_blkdev(0, "zram"); if (zram_major <= 0) { pr_err("Unable to get major number\n"); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 0088612..72c8584 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -43,10 +43,11 @@ /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { - /* Page consists the same element */ - ZRAM_SAME = ZRAM_FLAG_SHIFT, - ZRAM_ACCESS, /* page is now accessed */ + /* zram slot is locked */ + ZRAM_LOCK = ZRAM_FLAG_SHIFT, + ZRAM_SAME, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ + 
ZRAM_HUGE, /* Incompressible page */ __NR_ZRAM_PAGEFLAGS, }; @@ -60,6 +61,9 @@ struct zram_table_entry { unsigned long element; }; unsigned long value; +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + ktime_t ac_time; +#endif }; struct zram_stats { @@ -71,6 +75,7 @@ struct zram_stats { atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ atomic64_t same_pages; /* no. of same element filled pages */ + atomic64_t huge_pages; /* no. of huge pages */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ atomic64_t writestall; /* no. of write slow paths */ @@ -107,5 +112,8 @@ struct zram { unsigned long nr_pages; spinlock_t bitmap_lock; #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + struct dentry *debugfs_dir; +#endif }; #endif diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e622f0f..0429c8e 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -210,12 +210,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; - continue; - } - v9ses->debug = option; + } else { + v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG - p9_debug_level = option; + p9_debug_level = option; #endif + } break; case Opt_dfltuid: @@ -231,7 +231,6 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "uid field, but not a uid?\n"); ret = -EINVAL; - continue; } break; case Opt_dfltgid: @@ -247,7 +246,6 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "gid field, but not a gid?\n"); ret = -EINVAL; - continue; } break; case Opt_afid: @@ -256,9 +254,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; - continue; + } else { + v9ses->afid = option; } - v9ses->afid = option; break; case 
Opt_uname: kfree(v9ses->uname); @@ -306,13 +304,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) "problem allocating copy of cache arg\n"); goto free_and_return; } - ret = get_cache_mode(s); - if (ret == -EINVAL) { - kfree(s); - goto free_and_return; - } + r = get_cache_mode(s); + if (r < 0) + ret = r; + else + v9ses->cache = r; - v9ses->cache = ret; kfree(s); break; @@ -341,14 +338,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) pr_info("Unknown access argument %s\n", s); kfree(s); - goto free_and_return; + continue; } v9ses->uid = make_kuid(current_user_ns(), uid); if (!uid_valid(v9ses->uid)) { ret = -EINVAL; pr_info("Uknown uid %s\n", s); - kfree(s); - goto free_and_return; } } @@ -108,6 +108,7 @@ source "fs/notify/Kconfig" source "fs/quota/Kconfig" +source "fs/autofs/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" source "fs/overlayfs/Kconfig" @@ -203,6 +204,9 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +config MEMFD_CREATE + def_bool TMPFS || HUGETLBFS + config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/fs/Makefile b/fs/Makefile index c9375fd..2e00552 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ obj-$(CONFIG_QNX4FS_FS) += qnx4/ obj-$(CONFIG_QNX6FS_FS) += qnx6/ +obj-$(CONFIG_AUTOFS_FS) += autofs/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig new file mode 100644 index 0000000..6a2064e --- /dev/null +++ b/fs/autofs/Kconfig @@ -0,0 +1,20 @@ +config AUTOFS_FS + tristate "Kernel automounter support (supports v3, v4 and v5)" + default n + help + The automounter is a tool to automatically mount remote file systems + on demand. 
This implementation is partially kernel-based to reduce + overhead in the already-mounted case; this is unlike the BSD + automounter (amd), which is a pure user space daemon. + + To use the automounter you need the user-space tools from + <https://www.kernel.org/pub/linux/daemons/autofs/>; you also want + to answer Y to "NFS file system support", below. + + To compile this support as a module, choose M here: the module will be + called autofs. + + If you are not a part of a fairly large, distributed network or + don't have a laptop which needs to dynamically reconfigure to the + local network, you probably do not need an automounter, and can say + N here. diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile new file mode 100644 index 0000000..43fedde --- /dev/null +++ b/fs/autofs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux autofs-filesystem routines. +# + +obj-$(CONFIG_AUTOFS_FS) += autofs.o + +autofs-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o diff --git a/fs/autofs4/autofs_i.h b/fs/autofs/autofs_i.h index 4737615..9400a9f 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -9,7 +9,7 @@ /* Internal header file for autofs */ -#include <linux/auto_fs4.h> +#include <linux/auto_fs.h> #include <linux/auto_dev-ioctl.h> #include <linux/kernel.h> @@ -25,7 +25,7 @@ #include <linux/spinlock.h> #include <linux/list.h> #include <linux/completion.h> -#include <asm/current.h> +#include <linux/file.h> /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -122,44 +122,44 @@ struct autofs_sb_info { struct rcu_head rcu; }; -static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) +static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb) { return (struct autofs_sb_info *)(sb->s_fs_info); } -static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) +static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry) { return 
(struct autofs_info *)(dentry->d_fsdata); } -/* autofs4_oz_mode(): do we see the man behind the curtain? (The +/* autofs_oz_mode(): do we see the man behind the curtain? (The * processes which do manipulations for us in user space sees the raw * filesystem without "magic".) */ -static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) +static inline int autofs_oz_mode(struct autofs_sb_info *sbi) { return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } -struct inode *autofs4_get_inode(struct super_block *, umode_t); -void autofs4_free_ino(struct autofs_info *); +struct inode *autofs_get_inode(struct super_block *, umode_t); +void autofs_free_ino(struct autofs_info *); /* Expiration */ -int is_autofs4_dentry(struct dentry *); -int autofs4_expire_wait(const struct path *path, int rcu_walk); -int autofs4_expire_run(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, - struct autofs_packet_expire __user *); -int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when); -int autofs4_expire_multi(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, int __user *); -struct dentry *autofs4_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); -struct dentry *autofs4_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); +int is_autofs_dentry(struct dentry *); +int autofs_expire_wait(const struct path *path, int rcu_walk); +int autofs_expire_run(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, + struct autofs_packet_expire __user *); +int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when); +int autofs_expire_multi(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, int __user *); +struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, 
int how); +struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); /* Device node initialization */ @@ -168,11 +168,11 @@ void autofs_dev_ioctl_exit(void); /* Operations structures */ -extern const struct inode_operations autofs4_symlink_inode_operations; -extern const struct inode_operations autofs4_dir_inode_operations; -extern const struct file_operations autofs4_dir_operations; -extern const struct file_operations autofs4_root_operations; -extern const struct dentry_operations autofs4_dentry_operations; +extern const struct inode_operations autofs_symlink_inode_operations; +extern const struct inode_operations autofs_dir_inode_operations; +extern const struct file_operations autofs_dir_operations; +extern const struct file_operations autofs_root_operations; +extern const struct dentry_operations autofs_dentry_operations; /* VFS automount flags management functions */ static inline void __managed_dentry_set_managed(struct dentry *dentry) @@ -201,9 +201,9 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry) /* Initializing function */ -int autofs4_fill_super(struct super_block *, void *, int); -struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); -void autofs4_clean_ino(struct autofs_info *); +int autofs_fill_super(struct super_block *, void *, int); +struct autofs_info *autofs_new_ino(struct autofs_sb_info *); +void autofs_clean_ino(struct autofs_info *); static inline int autofs_prepare_pipe(struct file *pipe) { @@ -218,25 +218,25 @@ static inline int autofs_prepare_pipe(struct file *pipe) /* Queue management functions */ -int autofs4_wait(struct autofs_sb_info *, +int autofs_wait(struct autofs_sb_info *, const struct path *, enum autofs_notify); -int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); -void autofs4_catatonic_mode(struct autofs_sb_info *); +int autofs_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); +void 
autofs_catatonic_mode(struct autofs_sb_info *); -static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) +static inline u32 autofs_get_dev(struct autofs_sb_info *sbi) { return new_encode_dev(sbi->sb->s_dev); } -static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) +static inline u64 autofs_get_ino(struct autofs_sb_info *sbi) { return d_inode(sbi->sb->s_root)->i_ino; } -static inline void __autofs4_add_expiring(struct dentry *dentry) +static inline void __autofs_add_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { if (list_empty(&ino->expiring)) @@ -244,10 +244,10 @@ static inline void __autofs4_add_expiring(struct dentry *dentry) } } -static inline void autofs4_add_expiring(struct dentry *dentry) +static inline void autofs_add_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); @@ -257,10 +257,10 @@ static inline void autofs4_add_expiring(struct dentry *dentry) } } -static inline void autofs4_del_expiring(struct dentry *dentry) +static inline void autofs_del_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); @@ -270,4 +270,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry) } } -void autofs4_kill_sb(struct super_block *); +void autofs_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs/dev-ioctl.c 
index 26f6b4f..ea4ca14 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -7,23 +7,10 @@ * option, any later version, incorporated herein by reference. */ -#include <linux/module.h> -#include <linux/vmalloc.h> #include <linux/miscdevice.h> -#include <linux/init.h> -#include <linux/wait.h> -#include <linux/namei.h> -#include <linux/fcntl.h> -#include <linux/file.h> -#include <linux/fdtable.h> -#include <linux/sched.h> -#include <linux/cred.h> #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/magic.h> -#include <linux/dcache.h> -#include <linux/uaccess.h> -#include <linux/slab.h> #include "autofs_i.h" @@ -166,7 +153,7 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) if (f) { inode = file_inode(f); - sbi = autofs4_sbi(inode->i_sb); + sbi = autofs_sbi(inode->i_sb); } return sbi; } @@ -236,7 +223,7 @@ static int test_by_dev(const struct path *path, void *p) static int test_by_type(const struct path *path, void *p) { - struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + struct autofs_info *ino = autofs_dentry_ino(path->dentry); return ino && ino->sbi->type & *(unsigned *)p; } @@ -324,7 +311,7 @@ static int autofs_dev_ioctl_ready(struct file *fp, autofs_wqt_t token; token = (autofs_wqt_t) param->ready.token; - return autofs4_wait_release(sbi, token, 0); + return autofs_wait_release(sbi, token, 0); } /* @@ -340,7 +327,7 @@ static int autofs_dev_ioctl_fail(struct file *fp, token = (autofs_wqt_t) param->fail.token; status = param->fail.status < 0 ? 
param->fail.status : -ENOENT; - return autofs4_wait_release(sbi, token, status); + return autofs_wait_release(sbi, token, status); } /* @@ -412,7 +399,7 @@ static int autofs_dev_ioctl_catatonic(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); return 0; } @@ -459,10 +446,10 @@ static int autofs_dev_ioctl_requester(struct file *fp, if (err) goto out; - ino = autofs4_dentry_ino(path.dentry); + ino = autofs_dentry_ino(path.dentry); if (ino) { err = 0; - autofs4_expire_wait(&path, 0); + autofs_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); @@ -489,7 +476,7 @@ static int autofs_dev_ioctl_expire(struct file *fp, how = param->expire.how; mnt = fp->f_path.mnt; - return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how); + return autofs_do_expire_multi(sbi->sb, mnt, sbi, how); } /* Check if autofs mount point is in use */ @@ -686,7 +673,7 @@ static int _autofs_dev_ioctl(unsigned int command, * Admin needs to be able to set the mount catatonic in * order to be able to perform the re-open. 
*/ - if (!autofs4_oz_mode(sbi) && + if (!autofs_oz_mode(sbi) && cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { err = -EACCES; fput(fp); diff --git a/fs/autofs4/expire.c b/fs/autofs/expire.c index 57725d4..b332d3f 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs/expire.c @@ -13,10 +13,10 @@ static unsigned long now; /* Check if a dentry can be expired */ -static inline int autofs4_can_expire(struct dentry *dentry, - unsigned long timeout, int do_now) +static inline int autofs_can_expire(struct dentry *dentry, + unsigned long timeout, int do_now) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); /* dentry in the process of being deleted */ if (ino == NULL) @@ -31,7 +31,7 @@ static inline int autofs4_can_expire(struct dentry *dentry, } /* Check a mount point for busyness */ -static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) +static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) { struct dentry *top = dentry; struct path path = {.mnt = mnt, .dentry = dentry}; @@ -44,8 +44,8 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) if (!follow_down_one(&path)) goto done; - if (is_autofs4_dentry(path.dentry)) { - struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); + if (is_autofs_dentry(path.dentry)) { + struct autofs_sb_info *sbi = autofs_sbi(path.dentry->d_sb); /* This is an autofs submount, we can't expire it */ if (autofs_type_indirect(sbi->type)) @@ -56,7 +56,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) if (!may_umount_tree(path.mnt)) { struct autofs_info *ino; - ino = autofs4_dentry_ino(top); + ino = autofs_dentry_ino(top); ino->last_used = jiffies; goto done; } @@ -74,7 +74,7 @@ done: static struct dentry *get_next_positive_subdir(struct dentry *prev, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct list_head 
*next; struct dentry *q; @@ -121,7 +121,7 @@ cont: static struct dentry *get_next_positive_dentry(struct dentry *prev, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct list_head *next; struct dentry *p, *ret; @@ -184,10 +184,10 @@ again: * The tree is not busy iff no mountpoints are busy and there are no * autofs submounts. */ -static int autofs4_direct_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) +static int autofs_direct_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) { pr_debug("top %p %pd\n", top, top); @@ -195,14 +195,14 @@ static int autofs4_direct_busy(struct vfsmount *mnt, if (!may_umount_tree(mnt)) { struct autofs_info *ino; - ino = autofs4_dentry_ino(top); + ino = autofs_dentry_ino(top); if (ino) ino->last_used = jiffies; return 1; } /* Timeout of a direct mount is determined by its top dentry */ - if (!autofs4_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, do_now)) return 1; return 0; @@ -212,12 +212,12 @@ static int autofs4_direct_busy(struct vfsmount *mnt, * Check a directory tree of mount points for busyness * The tree is not busy iff no mountpoints are busy */ -static int autofs4_tree_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) +static int autofs_tree_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) { - struct autofs_info *top_ino = autofs4_dentry_ino(top); + struct autofs_info *top_ino = autofs_dentry_ino(top); struct dentry *p; pr_debug("top %p %pd\n", top, top); @@ -237,13 +237,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt, * If the fs is busy update the expiry counter. 
*/ if (d_mountpoint(p)) { - if (autofs4_mount_busy(mnt, p)) { + if (autofs_mount_busy(mnt, p)) { top_ino->last_used = jiffies; dput(p); return 1; } } else { - struct autofs_info *ino = autofs4_dentry_ino(p); + struct autofs_info *ino = autofs_dentry_ino(p); unsigned int ino_count = atomic_read(&ino->count); /* allow for dget above and top is already dgot */ @@ -261,16 +261,16 @@ static int autofs4_tree_busy(struct vfsmount *mnt, } /* Timeout of a tree mount is ultimately determined by its top dentry */ - if (!autofs4_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, do_now)) return 1; return 0; } -static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, - struct dentry *parent, - unsigned long timeout, - int do_now) +static struct dentry *autofs_check_leaves(struct vfsmount *mnt, + struct dentry *parent, + unsigned long timeout, + int do_now) { struct dentry *p; @@ -282,11 +282,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, if (d_mountpoint(p)) { /* Can we umount this guy */ - if (autofs4_mount_busy(mnt, p)) + if (autofs_mount_busy(mnt, p)) continue; /* Can we expire this guy */ - if (autofs4_can_expire(p, timeout, do_now)) + if (autofs_can_expire(p, timeout, do_now)) return p; } } @@ -294,10 +294,10 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, } /* Check if we can expire a direct mount (possibly a tree) */ -struct dentry *autofs4_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = dget(sb->s_root); @@ -310,9 +310,9 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, do_now)) { spin_lock(&sbi->fs_lock); - ino = 
autofs4_dentry_ino(root); + ino = autofs_dentry_ino(root); /* No point expiring a pending mount */ if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); @@ -321,7 +321,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, do_now)) { spin_lock(&sbi->fs_lock); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); @@ -350,7 +350,7 @@ static struct dentry *should_expire(struct dentry *dentry, { int do_now = how & AUTOFS_EXP_IMMEDIATE; int exp_leaves = how & AUTOFS_EXP_LEAVES; - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); unsigned int ino_count; /* No point expiring a pending mount */ @@ -367,11 +367,11 @@ static struct dentry *should_expire(struct dentry *dentry, pr_debug("checking mountpoint %p %pd\n", dentry, dentry); /* Can we umount this guy */ - if (autofs4_mount_busy(mnt, dentry)) + if (autofs_mount_busy(mnt, dentry)) return NULL; /* Can we expire this guy */ - if (autofs4_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, do_now)) return dentry; return NULL; } @@ -382,7 +382,7 @@ static struct dentry *should_expire(struct dentry *dentry, * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. 
*/ - if (autofs4_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, do_now)) return dentry; return NULL; } @@ -397,7 +397,7 @@ static struct dentry *should_expire(struct dentry *dentry, if (d_count(dentry) > ino_count) return NULL; - if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) + if (!autofs_tree_busy(mnt, dentry, timeout, do_now)) return dentry; /* * Case 3: pseudo direct mount, expire individual leaves @@ -411,7 +411,7 @@ static struct dentry *should_expire(struct dentry *dentry, if (d_count(dentry) > ino_count) return NULL; - expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); + expired = autofs_check_leaves(mnt, dentry, timeout, do_now); if (expired) { if (expired == dentry) dput(dentry); @@ -427,10 +427,10 @@ static struct dentry *should_expire(struct dentry *dentry, * - it is unused by any user process * - it has been unused for exp_timeout time */ -struct dentry *autofs4_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = sb->s_root; @@ -450,7 +450,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, int flags = how; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); continue; @@ -462,7 +462,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, continue; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(expired); + ino = autofs_dentry_ino(expired); ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); @@ -498,11 +498,11 @@ found: return expired; } -int autofs4_expire_wait(const struct path *path, int rcu_walk) +int autofs_expire_wait(const struct path *path, int rcu_walk) { struct dentry *dentry = path->dentry; - 
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); int status; int state; @@ -529,7 +529,7 @@ retry: pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); - status = autofs4_wait(sbi, path, NFY_NONE); + status = autofs_wait(sbi, path, NFY_NONE); wait_for_completion(&ino->expire_complete); pr_debug("expire done status=%d\n", status); @@ -545,10 +545,10 @@ retry: } /* Perform an expiry operation */ -int autofs4_expire_run(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - struct autofs_packet_expire __user *pkt_p) +int autofs_expire_run(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + struct autofs_packet_expire __user *pkt_p) { struct autofs_packet_expire pkt; struct autofs_info *ino; @@ -560,7 +560,7 @@ int autofs4_expire_run(struct super_block *sb, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = autofs_ptype_expire; - dentry = autofs4_expire_indirect(sb, mnt, sbi, 0); + dentry = autofs_expire_indirect(sb, mnt, sbi, 0); if (!dentry) return -EAGAIN; @@ -573,7 +573,7 @@ int autofs4_expire_run(struct super_block *sb, ret = -EFAULT; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); /* avoid rapid-fire expire attempts if expiry fails */ ino->last_used = now; ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); @@ -583,25 +583,25 @@ int autofs4_expire_run(struct super_block *sb, return ret; } -int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when) +int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when) { struct dentry *dentry; int ret = -EAGAIN; if (autofs_type_trigger(sbi->type)) - dentry = autofs4_expire_direct(sb, mnt, sbi, when); + dentry = 
autofs_expire_direct(sb, mnt, sbi, when); else - dentry = autofs4_expire_indirect(sb, mnt, sbi, when); + dentry = autofs_expire_indirect(sb, mnt, sbi, when); if (dentry) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); const struct path path = { .mnt = mnt, .dentry = dentry }; /* This is synchronous because it makes the daemon a * little easier */ - ret = autofs4_wait(sbi, &path, NFY_EXPIRE); + ret = autofs_wait(sbi, &path, NFY_EXPIRE); spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ @@ -619,7 +619,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, * Call repeatedly until it returns -EAGAIN, meaning there's nothing * more to be done. */ -int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, +int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int __user *arg) { int do_now = 0; @@ -627,6 +627,5 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - return autofs4_do_expire_multi(sb, mnt, sbi, do_now); + return autofs_do_expire_multi(sb, mnt, sbi, do_now); } - diff --git a/fs/autofs4/init.c b/fs/autofs/init.c index 8cf0e63..16fb613 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs/init.c @@ -13,18 +13,18 @@ static struct dentry *autofs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_nodev(fs_type, flags, data, autofs4_fill_super); + return mount_nodev(fs_type, flags, data, autofs_fill_super); } static struct file_system_type autofs_fs_type = { .owner = THIS_MODULE, .name = "autofs", .mount = autofs_mount, - .kill_sb = autofs4_kill_sb, + .kill_sb = autofs_kill_sb, }; MODULE_ALIAS_FS("autofs"); -static int __init init_autofs4_fs(void) +static int __init init_autofs_fs(void) { int err; @@ -37,12 +37,12 @@ static int __init init_autofs4_fs(void) return err; } -static 
void __exit exit_autofs4_fs(void) +static void __exit exit_autofs_fs(void) { autofs_dev_ioctl_exit(); unregister_filesystem(&autofs_fs_type); } -module_init(init_autofs4_fs) -module_exit(exit_autofs4_fs) +module_init(init_autofs_fs) +module_exit(exit_autofs_fs) MODULE_LICENSE("GPL"); diff --git a/fs/autofs4/inode.c b/fs/autofs/inode.c index 09e7d68..b51980f 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs/inode.c @@ -7,18 +7,14 @@ * option, any later version, incorporated herein by reference. */ -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/file.h> #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/parser.h> -#include <linux/bitops.h> #include <linux/magic.h> + #include "autofs_i.h" -#include <linux/module.h> -struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) +struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) { struct autofs_info *ino; @@ -32,21 +28,21 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) return ino; } -void autofs4_clean_ino(struct autofs_info *ino) +void autofs_clean_ino(struct autofs_info *ino) { ino->uid = GLOBAL_ROOT_UID; ino->gid = GLOBAL_ROOT_GID; ino->last_used = jiffies; } -void autofs4_free_ino(struct autofs_info *ino) +void autofs_free_ino(struct autofs_info *ino) { kfree(ino); } -void autofs4_kill_sb(struct super_block *sb) +void autofs_kill_sb(struct super_block *sb) { - struct autofs_sb_info *sbi = autofs4_sbi(sb); + struct autofs_sb_info *sbi = autofs_sbi(sb); /* * In the event of a failure in get_sb_nodev the superblock @@ -56,7 +52,7 @@ void autofs4_kill_sb(struct super_block *sb) */ if (sbi) { /* Free wait queues, close pipe */ - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); put_pid(sbi->oz_pgrp); } @@ -66,9 +62,9 @@ void autofs4_kill_sb(struct super_block *sb) kfree_rcu(sbi, rcu); } -static int autofs4_show_options(struct seq_file *m, struct dentry *root) +static int autofs_show_options(struct seq_file *m, struct dentry *root) { - 
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct inode *root_inode = d_inode(root->d_sb->s_root); if (!sbi) @@ -101,16 +97,16 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void autofs4_evict_inode(struct inode *inode) +static void autofs_evict_inode(struct inode *inode) { clear_inode(inode); kfree(inode->i_private); } -static const struct super_operations autofs4_sops = { +static const struct super_operations autofs_sops = { .statfs = simple_statfs, - .show_options = autofs4_show_options, - .evict_inode = autofs4_evict_inode, + .show_options = autofs_show_options, + .evict_inode = autofs_evict_inode, }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, @@ -206,7 +202,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, return (*pipefd < 0); } -int autofs4_fill_super(struct super_block *s, void *data, int silent) +int autofs_fill_super(struct super_block *s, void *data, int silent) { struct inode *root_inode; struct dentry *root; @@ -246,19 +242,19 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; - s->s_op = &autofs4_sops; - s->s_d_op = &autofs4_dentry_operations; + s->s_op = &autofs_sops; + s->s_d_op = &autofs_dentry_operations; s->s_time_gran = 1; /* * Get the root inode and dentry, but defer checking for errors. 
*/ - ino = autofs4_new_ino(sbi); + ino = autofs_new_ino(sbi); if (!ino) { ret = -ENOMEM; goto fail_free; } - root_inode = autofs4_get_inode(s, S_IFDIR | 0755); + root_inode = autofs_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); if (!root) goto fail_ino; @@ -305,8 +301,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); - root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = &autofs4_dir_inode_operations; + root_inode->i_fop = &autofs_root_operations; + root_inode->i_op = &autofs_dir_inode_operations; pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); @@ -340,14 +336,14 @@ fail_dput: dput(root); goto fail_free; fail_ino: - autofs4_free_ino(ino); + autofs_free_ino(ino); fail_free: kfree(sbi); s->s_fs_info = NULL; return ret; } -struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) +struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); @@ -364,10 +360,10 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) if (S_ISDIR(mode)) { set_nlink(inode, 2); - inode->i_op = &autofs4_dir_inode_operations; - inode->i_fop = &autofs4_dir_operations; + inode->i_op = &autofs_dir_inode_operations; + inode->i_fop = &autofs_dir_operations; } else if (S_ISLNK(mode)) { - inode->i_op = &autofs4_symlink_inode_operations; + inode->i_op = &autofs_symlink_inode_operations; } else WARN_ON(1); diff --git a/fs/autofs4/root.c b/fs/autofs/root.c index b12e37f..a3d4141 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs/root.c @@ -9,72 +9,66 @@ */ #include <linux/capability.h> -#include <linux/errno.h> -#include <linux/stat.h> -#include <linux/slab.h> -#include <linux/param.h> -#include <linux/time.h> #include <linux/compat.h> -#include <linux/mutex.h> #include "autofs_i.h" -static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *); 
-static int autofs4_dir_unlink(struct inode *, struct dentry *); -static int autofs4_dir_rmdir(struct inode *, struct dentry *); -static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t); -static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long); +static int autofs_dir_symlink(struct inode *, struct dentry *, const char *); +static int autofs_dir_unlink(struct inode *, struct dentry *); +static int autofs_dir_rmdir(struct inode *, struct dentry *); +static int autofs_dir_mkdir(struct inode *, struct dentry *, umode_t); +static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT -static long autofs4_root_compat_ioctl(struct file *, - unsigned int, unsigned long); +static long autofs_root_compat_ioctl(struct file *, + unsigned int, unsigned long); #endif -static int autofs4_dir_open(struct inode *inode, struct file *file); -static struct dentry *autofs4_lookup(struct inode *, - struct dentry *, unsigned int); -static struct vfsmount *autofs4_d_automount(struct path *); -static int autofs4_d_manage(const struct path *, bool); -static void autofs4_dentry_release(struct dentry *); - -const struct file_operations autofs4_root_operations = { +static int autofs_dir_open(struct inode *inode, struct file *file); +static struct dentry *autofs_lookup(struct inode *, + struct dentry *, unsigned int); +static struct vfsmount *autofs_d_automount(struct path *); +static int autofs_d_manage(const struct path *, bool); +static void autofs_dentry_release(struct dentry *); + +const struct file_operations autofs_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .read = generic_read_dir, .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, - .unlocked_ioctl = autofs4_root_ioctl, + .unlocked_ioctl = autofs_root_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = autofs4_root_compat_ioctl, + .compat_ioctl = autofs_root_compat_ioctl, #endif }; -const struct file_operations autofs4_dir_operations 
= { - .open = autofs4_dir_open, +const struct file_operations autofs_dir_operations = { + .open = autofs_dir_open, .release = dcache_dir_close, .read = generic_read_dir, .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, }; -const struct inode_operations autofs4_dir_inode_operations = { - .lookup = autofs4_lookup, - .unlink = autofs4_dir_unlink, - .symlink = autofs4_dir_symlink, - .mkdir = autofs4_dir_mkdir, - .rmdir = autofs4_dir_rmdir, +const struct inode_operations autofs_dir_inode_operations = { + .lookup = autofs_lookup, + .unlink = autofs_dir_unlink, + .symlink = autofs_dir_symlink, + .mkdir = autofs_dir_mkdir, + .rmdir = autofs_dir_rmdir, }; -const struct dentry_operations autofs4_dentry_operations = { - .d_automount = autofs4_d_automount, - .d_manage = autofs4_d_manage, - .d_release = autofs4_dentry_release, +const struct dentry_operations autofs_dentry_operations = { + .d_automount = autofs_d_automount, + .d_manage = autofs_d_manage, + .d_release = autofs_dentry_release, }; -static void autofs4_add_active(struct dentry *dentry) +static void autofs_add_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino; - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); if (!ino->active_count) { @@ -86,12 +80,12 @@ static void autofs4_add_active(struct dentry *dentry) } } -static void autofs4_del_active(struct dentry *dentry) +static void autofs_del_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino; - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); ino->active_count--; @@ -103,14 +97,14 @@ static void autofs4_del_active(struct dentry *dentry) } } -static int autofs4_dir_open(struct inode *inode, struct 
file *file) +static int autofs_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); - if (autofs4_oz_mode(sbi)) + if (autofs_oz_mode(sbi)) goto out; /* @@ -133,10 +127,10 @@ out: return dcache_dir_open(inode, file); } -static void autofs4_dentry_release(struct dentry *de) +static void autofs_dentry_release(struct dentry *de) { - struct autofs_info *ino = autofs4_dentry_ino(de); - struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); + struct autofs_info *ino = autofs_dentry_ino(de); + struct autofs_sb_info *sbi = autofs_sbi(de->d_sb); pr_debug("releasing %p\n", de); @@ -152,12 +146,12 @@ static void autofs4_dentry_release(struct dentry *de) spin_unlock(&sbi->lookup_lock); } - autofs4_free_ino(ino); + autofs_free_ino(ino); } -static struct dentry *autofs4_lookup_active(struct dentry *dentry) +static struct dentry *autofs_lookup_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; const struct qstr *name = &dentry->d_name; unsigned int len = name->len; @@ -209,10 +203,10 @@ next: return NULL; } -static struct dentry *autofs4_lookup_expiring(struct dentry *dentry, - bool rcu_walk) +static struct dentry *autofs_lookup_expiring(struct dentry *dentry, + bool rcu_walk) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; const struct qstr *name = &dentry->d_name; unsigned int len = name->len; @@ -269,17 +263,17 @@ next: return NULL; } -static int autofs4_mount_wait(const struct path *path, bool rcu_walk) +static int autofs_mount_wait(const struct path *path, bool rcu_walk) { - struct autofs_sb_info *sbi = 
autofs4_sbi(path->dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + struct autofs_sb_info *sbi = autofs_sbi(path->dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(path->dentry); int status = 0; if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; pr_debug("waiting for mount name=%pd\n", path->dentry); - status = autofs4_wait(sbi, path, NFY_MOUNT); + status = autofs_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); } ino->last_used = jiffies; @@ -291,11 +285,11 @@ static int do_expire_wait(const struct path *path, bool rcu_walk) struct dentry *dentry = path->dentry; struct dentry *expiring; - expiring = autofs4_lookup_expiring(dentry, rcu_walk); + expiring = autofs_lookup_expiring(dentry, rcu_walk); if (IS_ERR(expiring)) return PTR_ERR(expiring); if (!expiring) - return autofs4_expire_wait(path, rcu_walk); + return autofs_expire_wait(path, rcu_walk); else { const struct path this = { .mnt = path->mnt, .dentry = expiring }; /* @@ -303,17 +297,17 @@ static int do_expire_wait(const struct path *path, bool rcu_walk) * be quite complete, but the directory has been removed * so it must have been successful, just wait for it. 
*/ - autofs4_expire_wait(&this, 0); - autofs4_del_expiring(expiring); + autofs_expire_wait(&this, 0); + autofs_del_expiring(expiring); dput(expiring); } return 0; } -static struct dentry *autofs4_mountpoint_changed(struct path *path) +static struct dentry *autofs_mountpoint_changed(struct path *path) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); /* * If this is an indirect mount the dentry could have gone away @@ -327,7 +321,7 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; - ino = autofs4_dentry_ino(new); + ino = autofs_dentry_ino(new); ino->last_used = jiffies; dput(path->dentry); path->dentry = new; @@ -335,17 +329,17 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) return path->dentry; } -static struct vfsmount *autofs4_d_automount(struct path *path) +static struct vfsmount *autofs_d_automount(struct path *path) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); int status; pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never triggers a mount. 
*/ - if (autofs4_oz_mode(sbi)) + if (autofs_oz_mode(sbi)) return NULL; /* @@ -364,7 +358,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(path, 0); + status = autofs_mount_wait(path, 0); if (status) return ERR_PTR(status); goto done; @@ -405,7 +399,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(path, 0); + status = autofs_mount_wait(path, 0); spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; if (status) { @@ -416,24 +410,24 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_unlock(&sbi->fs_lock); done: /* Mount succeeded, check if we ended up with a new dentry */ - dentry = autofs4_mountpoint_changed(path); + dentry = autofs_mountpoint_changed(path); if (!dentry) return ERR_PTR(-ENOENT); return NULL; } -static int autofs4_d_manage(const struct path *path, bool rcu_walk) +static int autofs_d_manage(const struct path *path, bool rcu_walk) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); int status; pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never waits. */ - if (autofs4_oz_mode(sbi)) { + if (autofs_oz_mode(sbi)) { if (!path_is_mountpoint(path)) return -EISDIR; return 0; @@ -447,7 +441,7 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) * This dentry may be under construction so wait on mount * completion. 
*/ - status = autofs4_mount_wait(path, rcu_walk); + status = autofs_mount_wait(path, rcu_walk); if (status) return status; @@ -500,8 +494,8 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) } /* Lookups in the root directory */ -static struct dentry *autofs4_lookup(struct inode *dir, - struct dentry *dentry, unsigned int flags) +static struct dentry *autofs_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) { struct autofs_sb_info *sbi; struct autofs_info *ino; @@ -513,13 +507,13 @@ static struct dentry *autofs4_lookup(struct inode *dir, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - sbi = autofs4_sbi(dir->i_sb); + sbi = autofs_sbi(dir->i_sb); pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", current->pid, task_pgrp_nr(current), sbi->catatonic, - autofs4_oz_mode(sbi)); + autofs_oz_mode(sbi)); - active = autofs4_lookup_active(dentry); + active = autofs_lookup_active(dentry); if (active) return active; else { @@ -529,7 +523,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, * can return fail immediately. The daemon however does need * to create directories within the file system. 
*/ - if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) + if (!autofs_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) return ERR_PTR(-ENOENT); /* Mark entries in the root as mount triggers */ @@ -537,24 +531,24 @@ static struct dentry *autofs4_lookup(struct inode *dir, autofs_type_indirect(sbi->type)) __managed_dentry_set_managed(dentry); - ino = autofs4_new_ino(sbi); + ino = autofs_new_ino(sbi); if (!ino) return ERR_PTR(-ENOMEM); dentry->d_fsdata = ino; ino->dentry = dentry; - autofs4_add_active(dentry); + autofs_add_active(dentry); } return NULL; } -static int autofs4_dir_symlink(struct inode *dir, +static int autofs_dir_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; size_t size = strlen(symname); @@ -562,14 +556,14 @@ static int autofs4_dir_symlink(struct inode *dir, pr_debug("%s <- %pd\n", symname, dentry); - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; BUG_ON(!ino); - autofs4_clean_ino(ino); + autofs_clean_ino(ino); - autofs4_del_active(dentry); + autofs_del_active(dentry); cp = kmalloc(size + 1, GFP_KERNEL); if (!cp) @@ -577,7 +571,7 @@ static int autofs4_dir_symlink(struct inode *dir, strcpy(cp, symname); - inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); + inode = autofs_get_inode(dir->i_sb, S_IFLNK | 0555); if (!inode) { kfree(cp); return -ENOMEM; @@ -588,7 +582,7 @@ static int autofs4_dir_symlink(struct inode *dir, dget(dentry); atomic_inc(&ino->count); - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); @@ -610,20 +604,20 @@ static int autofs4_dir_symlink(struct inode *dir, * If a process is blocked on the dentry waiting for the 
expire to finish, * it will invalidate the dentry and try to mount with a new one. * - * Also see autofs4_dir_rmdir().. + * Also see autofs_dir_rmdir().. */ -static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) +static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; /* This allows root to remove symlinks */ - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_dec(&p_ino->count); } @@ -635,7 +629,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = current_time(dir); spin_lock(&sbi->lookup_lock); - __autofs4_add_expiring(dentry); + __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); @@ -692,15 +686,15 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry) managed_dentry_set_managed(parent); } -static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) +static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; pr_debug("dentry %p, removing %pd\n", dentry, dentry); - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; spin_lock(&sbi->lookup_lock); @@ -708,7 +702,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&sbi->lookup_lock); return 
-ENOTEMPTY; } - __autofs4_add_expiring(dentry); + __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); @@ -716,7 +710,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) autofs_clear_leaf_automount_flags(dentry); if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) atomic_dec(&p_ino->count); } @@ -730,26 +724,26 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs4_dir_mkdir(struct inode *dir, - struct dentry *dentry, umode_t mode) +static int autofs_dir_mkdir(struct inode *dir, + struct dentry *dentry, umode_t mode) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); - autofs4_clean_ino(ino); + autofs_clean_ino(ino); - autofs4_del_active(dentry); + autofs_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode); + inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) return -ENOMEM; d_add(dentry, inode); @@ -759,7 +753,7 @@ static int autofs4_dir_mkdir(struct inode *dir, dget(dentry); atomic_inc(&ino->count); - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); inc_nlink(dir); @@ -770,7 +764,7 @@ static int autofs4_dir_mkdir(struct inode *dir, /* Get/set timeout ioctl() operation */ #ifdef CONFIG_COMPAT -static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, +static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi, 
compat_ulong_t __user *p) { unsigned long ntimeout; @@ -795,7 +789,7 @@ error: } #endif -static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, +static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi, unsigned long __user *p) { unsigned long ntimeout; @@ -820,14 +814,14 @@ error: } /* Return protocol version */ -static inline int autofs4_get_protover(struct autofs_sb_info *sbi, +static inline int autofs_get_protover(struct autofs_sb_info *sbi, int __user *p) { return put_user(sbi->version, p); } /* Return protocol sub version */ -static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, +static inline int autofs_get_protosubver(struct autofs_sb_info *sbi, int __user *p) { return put_user(sbi->sub_version, p); @@ -836,7 +830,7 @@ static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, /* * Tells the daemon whether it can umount the autofs mount. */ -static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) +static inline int autofs_ask_umount(struct vfsmount *mnt, int __user *p) { int status = 0; @@ -850,14 +844,14 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) return status; } -/* Identify autofs4_dentries - this is so we can tell if there's +/* Identify autofs_dentries - this is so we can tell if there's * an extra dentry refcount or not. 
We only hold a refcount on the * dentry if its non-negative (ie, d_inode != NULL) */ -int is_autofs4_dentry(struct dentry *dentry) +int is_autofs_dentry(struct dentry *dentry) { return dentry && d_really_is_positive(dentry) && - dentry->d_op == &autofs4_dentry_operations && + dentry->d_op == &autofs_dentry_operations && dentry->d_fsdata != NULL; } @@ -865,10 +859,10 @@ int is_autofs4_dentry(struct dentry *dentry) * ioctl()'s on the root directory is the chief method for the daemon to * generate kernel reactions */ -static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, +static int autofs_root_ioctl_unlocked(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { - struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); + struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); void __user *p = (void __user *)arg; pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", @@ -878,64 +872,63 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ - return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0); + return autofs_wait_release(sbi, (autofs_wqt_t) arg, 0); case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ - return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); + return autofs_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); return 0; case AUTOFS_IOC_PROTOVER: /* Get protocol version */ - return autofs4_get_protover(sbi, p); + return autofs_get_protover(sbi, p); case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */ - return autofs4_get_protosubver(sbi, p); + return 
autofs_get_protosubver(sbi, p); case AUTOFS_IOC_SETTIMEOUT: - return autofs4_get_set_timeout(sbi, p); + return autofs_get_set_timeout(sbi, p); #ifdef CONFIG_COMPAT case AUTOFS_IOC_SETTIMEOUT32: - return autofs4_compat_get_set_timeout(sbi, p); + return autofs_compat_get_set_timeout(sbi, p); #endif case AUTOFS_IOC_ASKUMOUNT: - return autofs4_ask_umount(filp->f_path.mnt, p); + return autofs_ask_umount(filp->f_path.mnt, p); /* return a single thing to expire */ case AUTOFS_IOC_EXPIRE: - return autofs4_expire_run(inode->i_sb, - filp->f_path.mnt, sbi, p); + return autofs_expire_run(inode->i_sb, filp->f_path.mnt, sbi, p); /* same as above, but can send multiple expires through pipe */ case AUTOFS_IOC_EXPIRE_MULTI: - return autofs4_expire_multi(inode->i_sb, - filp->f_path.mnt, sbi, p); + return autofs_expire_multi(inode->i_sb, + filp->f_path.mnt, sbi, p); default: return -EINVAL; } } -static long autofs4_root_ioctl(struct file *filp, +static long autofs_root_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + return autofs_root_ioctl_unlocked(inode, filp, cmd, arg); } #ifdef CONFIG_COMPAT -static long autofs4_root_compat_ioctl(struct file *filp, +static long autofs_root_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); int ret; if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) - ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + ret = autofs_root_ioctl_unlocked(inode, filp, cmd, arg); else - ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, + ret = autofs_root_ioctl_unlocked(inode, filp, cmd, (unsigned long) compat_ptr(arg)); return ret; diff --git a/fs/autofs4/symlink.c b/fs/autofs/symlink.c index ab0b428..aad3902 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs/symlink.c @@ -8,22 +8,22 @@ #include "autofs_i.h" -static const char *autofs4_get_link(struct dentry *dentry, - struct 
inode *inode, - struct delayed_call *done) +static const char *autofs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { struct autofs_sb_info *sbi; struct autofs_info *ino; if (!dentry) return ERR_PTR(-ECHILD); - sbi = autofs4_sbi(dentry->d_sb); - ino = autofs4_dentry_ino(dentry); - if (ino && !autofs4_oz_mode(sbi)) + sbi = autofs_sbi(dentry->d_sb); + ino = autofs_dentry_ino(dentry); + if (ino && !autofs_oz_mode(sbi)) ino->last_used = jiffies; return d_inode(dentry)->i_private; } -const struct inode_operations autofs4_symlink_inode_operations = { - .get_link = autofs4_get_link +const struct inode_operations autofs_symlink_inode_operations = { + .get_link = autofs_get_link }; diff --git a/fs/autofs4/waitq.c b/fs/autofs/waitq.c index be9c3dc..f6385c6 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs/waitq.c @@ -7,19 +7,15 @@ * option, any later version, incorporated herein by reference. */ -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/signal.h> #include <linux/sched/signal.h> -#include <linux/file.h> #include "autofs_i.h" /* We make this a static variable rather than a part of the superblock; it * is better if we don't reassign numbers easily even across filesystems */ -static autofs_wqt_t autofs4_next_wait_queue = 1; +static autofs_wqt_t autofs_next_wait_queue = 1; -void autofs4_catatonic_mode(struct autofs_sb_info *sbi) +void autofs_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; @@ -49,8 +45,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) mutex_unlock(&sbi->wq_mutex); } -static int autofs4_write(struct autofs_sb_info *sbi, - struct file *file, const void *addr, int bytes) +static int autofs_write(struct autofs_sb_info *sbi, + struct file *file, const void *addr, int bytes) { unsigned long sigpipe, flags; const char *data = (const char *)addr; @@ -82,7 +78,7 @@ static int autofs4_write(struct autofs_sb_info *sbi, return bytes == 0 ? 0 : wr < 0 ? 
wr : -EIO; } -static void autofs4_notify_daemon(struct autofs_sb_info *sbi, +static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) { @@ -167,23 +163,23 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + switch (ret = autofs_write(sbi, pipe, &pkt, pktsz)) { case 0: break; case -ENOMEM: case -ERESTARTSYS: /* Just fail this one */ - autofs4_wait_release(sbi, wq->wait_queue_token, ret); + autofs_wait_release(sbi, wq->wait_queue_token, ret); break; default: - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); break; } fput(pipe); } -static int autofs4_getpath(struct autofs_sb_info *sbi, - struct dentry *dentry, char **name) +static int autofs_getpath(struct autofs_sb_info *sbi, + struct dentry *dentry, char *name) { struct dentry *root = sbi->sb->s_root; struct dentry *tmp; @@ -193,7 +189,7 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, unsigned seq; rename_retry: - buf = *name; + buf = name; len = 0; seq = read_seqbegin(&rename_lock); @@ -228,7 +224,7 @@ rename_retry: } static struct autofs_wait_queue * -autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) +autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) { struct autofs_wait_queue *wq; @@ -263,7 +259,7 @@ static int validate_request(struct autofs_wait_queue **wait, return -ENOENT; /* Wait in progress, continue; */ - wq = autofs4_find_wait(sbi, qstr); + wq = autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; @@ -272,7 +268,7 @@ static int validate_request(struct autofs_wait_queue **wait, *wait = NULL; /* If we don't yet have any info this is a new request */ - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (!ino) return 1; @@ -297,7 +293,7 @@ static int validate_request(struct autofs_wait_queue **wait, if (sbi->catatonic) return -ENOENT; - wq = autofs4_find_wait(sbi, qstr); + wq = 
autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; @@ -351,7 +347,7 @@ static int validate_request(struct autofs_wait_queue **wait, return 1; } -int autofs4_wait(struct autofs_sb_info *sbi, +int autofs_wait(struct autofs_sb_info *sbi, const struct path *path, enum autofs_notify notify) { struct dentry *dentry = path->dentry; @@ -399,7 +395,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) qstr.len = sprintf(name, "%p", dentry); else { - qstr.len = autofs4_getpath(sbi, dentry, &name); + qstr.len = autofs_getpath(sbi, dentry, name); if (!qstr.len) { kfree(name); return -ENOENT; @@ -430,15 +426,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -ENOMEM; } - wq->wait_queue_token = autofs4_next_wait_queue; - if (++autofs4_next_wait_queue == 0) - autofs4_next_wait_queue = 1; + wq->wait_queue_token = autofs_next_wait_queue; + if (++autofs_next_wait_queue == 0) + autofs_next_wait_queue = 1; wq->next = sbi->queues; sbi->queues = wq; init_waitqueue_head(&wq->queue); memcpy(&wq->name, &qstr, sizeof(struct qstr)); - wq->dev = autofs4_get_dev(sbi); - wq->ino = autofs4_get_ino(sbi); + wq->dev = autofs_get_dev(sbi); + wq->ino = autofs_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); wq->pid = pid; @@ -467,9 +463,9 @@ int autofs4_wait(struct autofs_sb_info *sbi, wq->name.name, notify); /* - * autofs4_notify_daemon() may block; it will unlock ->wq_mutex + * autofs_notify_daemon() may block; it will unlock ->wq_mutex */ - autofs4_notify_daemon(sbi, wq, type); + autofs_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", @@ -500,12 +496,12 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *de = NULL; /* direct mount or browsable map */ - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (!ino) { /* If not lookup actual dentry used */ de = d_lookup(dentry->d_parent, &dentry->d_name); if (de) - ino = 
autofs4_dentry_ino(de); + ino = autofs_dentry_ino(de); } /* Set mount requester */ @@ -530,7 +526,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, } -int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) +int autofs_wait_release(struct autofs_sb_info *sbi, + autofs_wqt_t wait_queue_token, int status) { struct autofs_wait_queue *wq, **wql; diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig index 44727bf..99fda4d 100644 --- a/fs/autofs4/Kconfig +++ b/fs/autofs4/Kconfig @@ -1,5 +1,7 @@ config AUTOFS4_FS - tristate "Kernel automounter version 4 support (also supports v3)" + tristate "Kernel automounter version 4 support (also supports v3 and v5)" + default n + depends on AUTOFS_FS = n help The automounter is a tool to automatically mount remote file systems on demand. This implementation is partially kernel-based to reduce @@ -7,14 +9,38 @@ config AUTOFS4_FS automounter (amd), which is a pure user space daemon. To use the automounter you need the user-space tools from - <https://www.kernel.org/pub/linux/daemons/autofs/v4/>; you also - want to answer Y to "NFS file system support", below. + <https://www.kernel.org/pub/linux/daemons/autofs/>; you also want + to answer Y to "NFS file system support", below. - To compile this support as a module, choose M here: the module will be - called autofs4. You will need to add "alias autofs autofs4" to your - modules configuration file. + This module is in the process of being renamed from autofs4 to + autofs. Since autofs is now the only module that provides the + autofs file system the module is not version 4 specific. - If you are not a part of a fairly large, distributed network or - don't have a laptop which needs to dynamically reconfigure to the - local network, you probably do not need an automounter, and can say - N here. + The autofs4 module is now built from the source located in + fs/autofs. 
The autofs4 directory and its configuration entry + will be removed two kernel versions from the inclusion of this + change. + + Changes that will need to be made should be limited to: + - source include statements should be changed from auto_fs4.h to + auto_fs.h since these two header files have been merged. + - user space scripts that manually load autofs4.ko should be + changed to load autofs.ko. But since the module directory name + and the module name are the same as the file system name there + is no need to manually load the module. + - any "alias autofs autofs4" will need to be removed. + - due to the autofs4 module directory name not being the same as + its file system name autoloading didn't work properly. Because + of this kernel configurations would often build the module into + the kernel. This may have resulted in selinux policies that will + prevent the autofs module from autoloading and will need to be + updated. + + Please configure AUTOFS_FS instead of AUTOFS4_FS from now on. + + NOTE: Since the modules autofs and autofs4 use the same file system + type name of "autofs" only one can be built. The "depends" + above will result in AUTOFS4_FS not appearing in .config for + any setting of AUTOFS_FS other than n and AUTOFS4_FS will + appear under the AUTOFS_FS entry otherwise which is intended + to draw attention to the module rename change. 
diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile index a811c1f..417dd72 100644 --- a/fs/autofs4/Makefile +++ b/fs/autofs4/Makefile @@ -4,4 +4,6 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4.o -autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o +autofs4-objs := ../autofs/init.o ../autofs/inode.o ../autofs/root.o \ + ../autofs/symlink.o ../autofs/waitq.o ../autofs/expire.o \ + ../autofs/dev-ioctl.o diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a41b48f..4de1915 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -387,8 +387,13 @@ static Node *create_entry(const char __user *buffer, size_t count) s = strchr(p, del); if (!s) goto einval; - *s++ = '\0'; - e->offset = simple_strtoul(p, &p, 10); + *s = '\0'; + if (p != s) { + int r = kstrtoint(p, 10, &e->offset); + if (r != 0 || e->offset < 0) + goto einval; + } + p = s; if (*p++) goto einval; pr_debug("register: offset: %#x\n", e->offset); @@ -428,7 +433,8 @@ static Node *create_entry(const char __user *buffer, size_t count) if (e->mask && string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) goto einval; - if (e->size + e->offset > BINPRM_BUF_SIZE) + if (e->size > BINPRM_BUF_SIZE || + BINPRM_BUF_SIZE - e->size < e->offset) goto einval; pr_debug("register: magic/mask length: %i\n", e->size); if (USE_DEBUG) { diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index ef80085..9907475 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -38,8 +38,6 @@ #include <linux/ppp-ioctl.h> #include <linux/if_pppox.h> #include <linux/mtio.h> -#include <linux/auto_fs.h> -#include <linux/auto_fs4.h> #include <linux/tty.h> #include <linux/vt_kern.h> #include <linux/fb.h> @@ -905,12 +905,12 @@ out: * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. 
*/ -static int dax_load_hole(struct address_space *mapping, void *entry, +static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, struct vm_fault *vmf) { struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; - int ret = VM_FAULT_NOPAGE; + vm_fault_t ret = VM_FAULT_NOPAGE; struct page *zero_page; void *entry2; pfn_t pfn; @@ -929,7 +929,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry, goto out; } - vm_insert_mixed(vmf->vma, vaddr, pfn); + ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); out: trace_dax_load_hole(inode, vmf, ret); return ret; @@ -1112,7 +1112,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, } EXPORT_SYMBOL_GPL(dax_iomap_rw); -static int dax_fault_return(int error) +static vm_fault_t dax_fault_return(int error) { if (error == 0) return VM_FAULT_NOPAGE; @@ -1132,7 +1132,7 @@ static bool dax_fault_is_synchronous(unsigned long flags, && (iomap->flags & IOMAP_F_DIRTY); } -static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1145,18 +1145,18 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int error, major = 0; bool write = vmf->flags & FAULT_FLAG_WRITE; bool sync; - int vmf_ret = 0; + vm_fault_t ret = 0; void *entry; pfn_t pfn; - trace_dax_pte_fault(inode, vmf, vmf_ret); + trace_dax_pte_fault(inode, vmf, ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. 
*/ if (pos >= i_size_read(inode)) { - vmf_ret = VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS; goto out; } @@ -1165,7 +1165,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = grab_mapping_entry(mapping, vmf->pgoff, 0); if (IS_ERR(entry)) { - vmf_ret = dax_fault_return(PTR_ERR(entry)); + ret = dax_fault_return(PTR_ERR(entry)); goto out; } @@ -1176,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, * retried. */ if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { - vmf_ret = VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; goto unlock_entry; } @@ -1189,7 +1189,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (iomap_errp) *iomap_errp = error; if (error) { - vmf_ret = dax_fault_return(error); + ret = dax_fault_return(error); goto unlock_entry; } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { @@ -1219,9 +1219,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto error_finish_iomap; __SetPageUptodate(vmf->cow_page); - vmf_ret = finish_fault(vmf); - if (!vmf_ret) - vmf_ret = VM_FAULT_DONE_COW; + ret = finish_fault(vmf); + if (!ret) + ret = VM_FAULT_DONE_COW; goto finish_iomap; } @@ -1257,23 +1257,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto error_finish_iomap; } *pfnp = pfn; - vmf_ret = VM_FAULT_NEEDDSYNC | major; + ret = VM_FAULT_NEEDDSYNC | major; goto finish_iomap; } trace_dax_insert_mapping(inode, vmf, entry); if (write) - error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); + ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn); else - error = vm_insert_mixed(vma, vaddr, pfn); + ret = vmf_insert_mixed(vma, vaddr, pfn); - /* -EBUSY is fine, somebody else faulted on the same PTE */ - if (error == -EBUSY) - error = 0; - break; + goto finish_iomap; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!write) { - vmf_ret = dax_load_hole(mapping, entry, vmf); + ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ 
-1284,12 +1281,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, } error_finish_iomap: - vmf_ret = dax_fault_return(error) | major; + ret = dax_fault_return(error); finish_iomap: if (ops->iomap_end) { int copied = PAGE_SIZE; - if (vmf_ret & VM_FAULT_ERROR) + if (ret & VM_FAULT_ERROR) copied = 0; /* * The fault is done by now and there's no way back (other @@ -1302,12 +1299,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff); out: - trace_dax_pte_fault_done(inode, vmf, vmf_ret); - return vmf_ret; + trace_dax_pte_fault_done(inode, vmf, ret); + return ret | major; } #ifdef CONFIG_FS_DAX_PMD -static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, +static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; @@ -1348,7 +1345,7 @@ fallback: return VM_FAULT_FALLBACK; } -static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1358,7 +1355,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, bool sync; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; - int result = VM_FAULT_FALLBACK; + vm_fault_t result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; pgoff_t max_pgoff, pgoff; void *entry; @@ -1509,7 +1506,7 @@ out: return result; } #else -static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -1529,7 +1526,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * has done all the necessary locking for page fault to proceed * successfully. 
*/ -int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { switch (pe_size) { @@ -1553,14 +1550,14 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault); * DAX file. It takes care of marking corresponding radix tree entry as dirty * as well. */ -static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, +static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t pfn) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *entry, **slot; pgoff_t index = vmf->pgoff; - int vmf_ret, error; + vm_fault_t ret; xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); @@ -1579,21 +1576,20 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, xa_unlock_irq(&mapping->i_pages); switch (pe_size) { case PE_SIZE_PTE: - error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); - vmf_ret = dax_fault_return(error); + ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); break; #ifdef CONFIG_FS_DAX_PMD case PE_SIZE_PMD: - vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, true); break; #endif default: - vmf_ret = VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; } put_locked_mapping_entry(mapping, index); - trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret); - return vmf_ret; + trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); + return ret; } /** @@ -1606,8 +1602,8 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, * stored persistently on the media and handles inserting of appropriate page * table entry. 
*/ -int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - pfn_t pfn) +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, pfn_t pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; @@ -23,7 +23,7 @@ #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> -#include <linux/shmem_fs.h> +#include <linux/memfd.h> #include <linux/compat.h> #include <linux/poll.h> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 97a972e..68728de 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -788,35 +788,34 @@ static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, spin_unlock(&lockres->l_lock); } -static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, - struct ocfs2_lock_holder *oh) -{ - spin_lock(&lockres->l_lock); - list_del(&oh->oh_list); - spin_unlock(&lockres->l_lock); - - put_pid(oh->oh_owner_pid); -} - -static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres) +static struct ocfs2_lock_holder * +ocfs2_pid_holder(struct ocfs2_lock_res *lockres, + struct pid *pid) { struct ocfs2_lock_holder *oh; - struct pid *pid; - /* look in the list of holders for one with the current task as owner */ spin_lock(&lockres->l_lock); - pid = task_pid(current); list_for_each_entry(oh, &lockres->l_holders, oh_list) { if (oh->oh_owner_pid == pid) { spin_unlock(&lockres->l_lock); - return 1; + return oh; } } spin_unlock(&lockres->l_lock); + return NULL; +} - return 0; +static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + spin_lock(&lockres->l_lock); + list_del(&oh->oh_list); + spin_unlock(&lockres->l_lock); + + put_pid(oh->oh_owner_pid); } + static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, int level) { @@ -2610,34 +2609,93 @@ void ocfs2_inode_unlock(struct inode *inode, * * return < 0 on error, return == 0 if there's no lock holder on the stack * before 
this call, return == 1 if this call would be a recursive locking. + * return == -1 if this lock attempt will cause an upgrade which is forbidden. + * + * When taking lock levels into account,we face some different situations. + * + * 1. no lock is held + * In this case, just lock the inode as requested and return 0 + * + * 2. We are holding a lock + * For this situation, things diverges into several cases + * + * wanted holding what to do + * ex ex see 2.1 below + * ex pr see 2.2 below + * pr ex see 2.1 below + * pr pr see 2.1 below + * + * 2.1 lock level that is been held is compatible + * with the wanted level, so no lock action will be tacken. + * + * 2.2 Otherwise, an upgrade is needed, but it is forbidden. + * + * Reason why upgrade within a process is forbidden is that + * lock upgrade may cause dead lock. The following illustrates + * how it happens. + * + * thread on node1 thread on node2 + * ocfs2_inode_lock_tracker(ex=0) + * + * <====== ocfs2_inode_lock_tracker(ex=1) + * + * ocfs2_inode_lock_tracker(ex=1) */ int ocfs2_inode_lock_tracker(struct inode *inode, struct buffer_head **ret_bh, int ex, struct ocfs2_lock_holder *oh) { - int status; - int arg_flags = 0, has_locked; + int status = 0; struct ocfs2_lock_res *lockres; + struct ocfs2_lock_holder *tmp_oh; + struct pid *pid = task_pid(current); + lockres = &OCFS2_I(inode)->ip_inode_lockres; - has_locked = ocfs2_is_locked_by_me(lockres); - /* Just get buffer head if the cluster lock has been taken */ - if (has_locked) - arg_flags = OCFS2_META_LOCK_GETBH; + tmp_oh = ocfs2_pid_holder(lockres, pid); - if (likely(!has_locked || ret_bh)) { - status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags); + if (!tmp_oh) { + /* + * This corresponds to the case 1. + * We haven't got any lock before. 
+ */ + status = ocfs2_inode_lock_full(inode, ret_bh, ex, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); return status; } - } - if (!has_locked) + + oh->oh_ex = ex; ocfs2_add_holder(lockres, oh); + return 0; + } - return has_locked; + if (unlikely(ex && !tmp_oh->oh_ex)) { + /* + * case 2.2 upgrade may cause dead lock, forbid it. + */ + mlog(ML_ERROR, "Recursive locking is not permitted to " + "upgrade to EX level from PR level.\n"); + dump_stack(); + return -EINVAL; + } + + /* + * case 2.1 OCFS2_META_LOCK_GETBH flag make ocfs2_inode_lock_full. + * ignore the lock level and just update it. + */ + if (ret_bh) { + status = ocfs2_inode_lock_full(inode, ret_bh, ex, + OCFS2_META_LOCK_GETBH); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + } + return tmp_oh ? 1 : 0; } void ocfs2_inode_unlock_tracker(struct inode *inode, @@ -2649,12 +2707,13 @@ void ocfs2_inode_unlock_tracker(struct inode *inode, lockres = &OCFS2_I(inode)->ip_inode_lockres; /* had_lock means that the currect process already takes the cluster - * lock previously. If had_lock is 1, we have nothing to do here, and - * it will get unlocked where we got the lock. + * lock previously. + * If had_lock is 1, we have nothing to do here. + * If had_lock is 0, we will release the lock. 
*/ if (!had_lock) { + ocfs2_inode_unlock(inode, oh->oh_ex); ocfs2_remove_holder(lockres, oh); - ocfs2_inode_unlock(inode, ex); } } diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 256e0a9..4ec1c82 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -96,6 +96,7 @@ struct ocfs2_trim_fs_info { struct ocfs2_lock_holder { struct list_head oh_list; struct pid *oh_owner_pid; + int oh_ex; }; /* ocfs2_inode_lock_full() 'arg_flags' flags */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6ee94bc..a2a8603 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -563,8 +563,8 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, return ret; } -static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten) +static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) { int status = 0; int restart_func = 0; @@ -1035,8 +1035,8 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, clusters_to_add -= oi->ip_clusters; if (clusters_to_add) { - ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, - clusters_to_add, 0); + ret = ocfs2_extend_allocation(inode, oi->ip_clusters, + clusters_to_add, 0); if (ret) { mlog_errno(ret); goto out; @@ -1493,7 +1493,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, goto next; } - ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); + ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 1fdc983..7eb7f03 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -65,8 +65,6 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); -int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int 
mark_unwritten); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index ab30c005..994726a 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -402,7 +402,7 @@ out_err: static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, unsigned int chunksize) { - int index; + u32 index; index = __ilog2_u32(chunksize); if (index >= OCFS2_INFO_MAX_HIST) diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index fb9a20e..05220b3 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -44,11 +44,11 @@ #include "ocfs2_trace.h" -static int ocfs2_fault(struct vm_fault *vmf) +static vm_fault_t ocfs2_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; sigset_t oldset; - int ret; + vm_fault_t ret; ocfs2_block_signals(&oldset); ret = filemap_fault(vmf); @@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_fault *vmf) return ret; } -static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, - struct page *page) +static vm_fault_t __ocfs2_page_mkwrite(struct file *file, + struct buffer_head *di_bh, struct page *page) { - int ret = VM_FAULT_NOPAGE; + int err; + vm_fault_t ret = VM_FAULT_NOPAGE; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); @@ -105,15 +106,12 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, if (page->index == last_index) len = ((size - 1) & ~PAGE_MASK) + 1; - ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, + err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, &locked_page, &fsdata, di_bh, page); - if (ret) { - if (ret != -ENOSPC) - mlog_errno(ret); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + if (err) { + if (err != -ENOSPC) + mlog_errno(err); + ret = vmf_error(err); goto out; } @@ -121,20 
+119,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, ret = VM_FAULT_NOPAGE; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); - BUG_ON(ret != len); + err = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); + BUG_ON(err != len); ret = VM_FAULT_LOCKED; out: return ret; } -static int ocfs2_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; - int ret; + int err; + vm_fault_t ret; sb_start_pagefault(inode->i_sb); ocfs2_block_signals(&oldset); @@ -144,13 +143,10 @@ static int ocfs2_page_mkwrite(struct vm_fault *vmf) * node. Taking the data lock will also ensure that we don't * attempt page truncation as part of a downconvert. */ - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - mlog_errno(ret); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + err = ocfs2_inode_lock(inode, &di_bh, 1); + if (err < 0) { + mlog_errno(err); + ret = vmf_error(err); goto out; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8dd6f70..b7ca84b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2332,8 +2332,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio) { - const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN; - char name[namelen + 1]; + char name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 5bb4a89..7071ad0 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -807,11 +807,11 @@ struct ocfs2_dir_block_trailer { * in this block. 
(unused) */ /*10*/ __u8 db_signature[8]; /* Signature for verification */ __le64 db_reserved2; - __le64 db_free_next; /* Next block in list (unused) */ -/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */ - __le64 db_parent_dinode; /* dinode which owns me, in +/*20*/ __le64 db_free_next; /* Next block in list (unused) */ + __le64 db_blkno; /* Offset on disk, in blocks */ +/*30*/ __le64 db_parent_dinode; /* dinode which owns me, in blocks */ -/*30*/ struct ocfs2_block_check db_check; /* Error checking */ + struct ocfs2_block_check db_check; /* Error checking */ /*40*/ }; diff --git a/fs/proc/array.c b/fs/proc/array.c index 004077f..0ceb3b6 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -268,7 +268,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) unsigned long flags; sigset_t pending, shpending, blocked, ignored, caught; int num_threads = 0; - unsigned long qsize = 0; + unsigned int qsize = 0; unsigned long qlim = 0; sigemptyset(&pending); diff --git a/fs/proc/base.c b/fs/proc/base.c index af128b3..44dec22 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -213,10 +213,14 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, char *page; unsigned long count = _count; unsigned long arg_start, arg_end, env_start, env_end; - unsigned long len1, len2, len; - unsigned long p; + unsigned long len1, len2; + char __user *buf0 = buf; + struct { + unsigned long p; + unsigned long len; + } cmdline[2]; char c; - ssize_t rv; + int rv; BUG_ON(*pos < 0); @@ -239,12 +243,12 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, goto out_mmput; } - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); BUG_ON(arg_start > arg_end); BUG_ON(env_start > env_end); @@ -253,61 +257,31 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user 
*buf, len2 = env_end - env_start; /* Empty ARGV. */ - if (len1 == 0) { - rv = 0; - goto out_free_page; - } + if (len1 == 0) + goto end; + /* * Inherently racy -- command line shares address space * with code and data. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON); - if (rv <= 0) - goto out_free_page; - - rv = 0; + if (access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON) != 1) + goto end; + cmdline[0].p = arg_start; + cmdline[0].len = len1; if (c == '\0') { /* Command line (set of strings) occupies whole ARGV. */ - if (len1 <= *pos) - goto out_free_page; - - p = arg_start + *pos; - len = len1 - *pos; - while (count > 0 && len > 0) { - unsigned int _count; - int nr_read; - - _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); - if (nr_read < 0) - rv = nr_read; - if (nr_read <= 0) - goto out_free_page; - - if (copy_to_user(buf, page, nr_read)) { - rv = -EFAULT; - goto out_free_page; - } - - p += nr_read; - len -= nr_read; - buf += nr_read; - count -= nr_read; - rv += nr_read; - } + cmdline[1].len = 0; } else { /* * Command line (1 string) occupies ARGV and * extends into ENVP. 
*/ - struct { - unsigned long p; - unsigned long len; - } cmdline[2] = { - { .p = arg_start, .len = len1 }, - { .p = env_start, .len = len2 }, - }; + cmdline[1].p = env_start; + cmdline[1].len = len2; + } + + { loff_t pos1 = *pos; unsigned int i; @@ -317,44 +291,40 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, i++; } while (i < 2) { + unsigned long p; + unsigned long len; + p = cmdline[i].p + pos1; len = cmdline[i].len - pos1; while (count > 0 && len > 0) { - unsigned int _count, l; - int nr_read; - bool final; - - _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); - if (nr_read < 0) - rv = nr_read; - if (nr_read <= 0) - goto out_free_page; + unsigned int nr_read, nr_write; + + nr_read = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, nr_read, FOLL_ANON); + if (nr_read == 0) + goto end; /* * Command line can be shorter than whole ARGV * even if last "marker" byte says it is not. */ - final = false; - l = strnlen(page, nr_read); - if (l < nr_read) { - nr_read = l; - final = true; - } + if (c == '\0') + nr_write = nr_read; + else + nr_write = strnlen(page, nr_read); - if (copy_to_user(buf, page, nr_read)) { + if (copy_to_user(buf, page, nr_write)) { rv = -EFAULT; goto out_free_page; } - p += nr_read; - len -= nr_read; - buf += nr_read; - count -= nr_read; - rv += nr_read; + p += nr_write; + len -= nr_write; + buf += nr_write; + count -= nr_write; - if (final) - goto out_free_page; + if (nr_write < nr_read) + goto end; } /* Only first chunk can be read partially. 
*/ @@ -363,12 +333,13 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, } } +end: + *pos += buf - buf0; + rv = buf - buf0; out_free_page: free_page((unsigned long)page); out_mmput: mmput(mm); - if (rv > 0) - *pos += rv; return rv; } @@ -430,7 +401,6 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct stack_trace trace; unsigned long *entries; int err; - int i; entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); if (!entries) @@ -443,6 +413,8 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, err = lock_trace(task); if (!err) { + unsigned int i; + save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { @@ -927,10 +899,10 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; @@ -1784,9 +1756,9 @@ int pid_getattr(const struct path *path, struct kstat *stat, generic_fillattr(inode, stat); - rcu_read_lock(); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; + rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) { @@ -1875,7 +1847,7 @@ const struct dentry_operations pid_dentry_operations = * by stat. 
*/ bool proc_fill_cache(struct file *file, struct dir_context *ctx, - const char *name, int len, + const char *name, unsigned int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = file->f_path.dentry; @@ -3251,7 +3223,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[10 + 1]; - int len; + unsigned int len; cond_resched(); if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) @@ -3578,7 +3550,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) task; task = next_tid(task), ctx->pos++) { char name[10 + 1]; - int len; + unsigned int len; tid = task_pid_nr_ns(task, ns); len = snprintf(name, sizeof(name), "%u", tid); if (!proc_fill_cache(file, ctx, name, len, diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 05b9893..81882a1 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -248,7 +248,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, struct file *f; struct fd_data data; char name[10 + 1]; - int len; + unsigned int len; f = fcheck_files(files, fd); if (!f) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 93eb190..50cb22a 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -163,7 +163,7 @@ extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); -extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, +bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* diff --git a/fs/proc/page.c b/fs/proc/page.c index 1491918..792c78a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -154,6 +154,8 @@ u64 stable_page_flags(struct page *page) if (PageBalloon(page)) u |= 1 << KPF_BALLOON; + if (PageTable(page)) + u |= 1 << KPF_PGTABLE; if (page_is_idle(page)) u 
|= 1 << KPF_IDLE; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7e07413..597969d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1259,8 +1259,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; entry = pte_to_swp_entry(pte); - frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); @@ -1311,11 +1312,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); - unsigned long offset = swp_offset(entry); + unsigned long offset; - offset += (addr & ~PMD_MASK) >> PAGE_SHIFT; - frame = swp_type(entry) | - (offset << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) { + offset = swp_offset(entry) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); + } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; @@ -1333,10 +1337,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, err = add_to_pagemap(addr, &pme, pm); if (err) break; - if (pm->show_pfn && (flags & PM_PRESENT)) - frame++; - else if (flags & PM_SWAP) - frame += (1 << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) { + if (flags & PM_PRESENT) + frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); + } } spin_unlock(ptl); return err; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cec550c..123bf7d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -62,6 +62,8 @@ struct userfaultfd_ctx { enum userfaultfd_state state; /* released */ bool released; + /* memory mappings are changing because of non-cooperative event */ + bool mmap_changing; /* mm with one ore more vmas 
attached to this userfaultfd_ctx */ struct mm_struct *mm; }; @@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, * already released. */ out: + WRITE_ONCE(ctx->mmap_changing, false); userfaultfd_ctx_put(ctx); } @@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; + ctx->mmap_changing = false; ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); + WRITE_ONCE(octx->mmap_changing, true); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); } } @@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); up_read(&mm->mmap_sem); msg_init(&ewq.msg); @@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, return -ENOMEM; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + ret = -EFAULT; if (copy_from_user(&uffdio_copy, user_uffdio_copy, /* don't copy "copy" last field */ @@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len); + uffdio_copy.len, &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct 
uffdio_zeropage __user *) arg; + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + ret = -EFAULT; if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, /* don't copy "zeropage" last field */ @@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (mmget_not_zero(ctx->mm)) { ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); + uffdio_zeropage.range.len, + &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; + ctx->mmap_changing = false; ctx->mm = current->mm; /* prevent the mm struct to be freed */ mmgrab(ctx->mm); diff --git a/include/asm-generic/int-ll64.h b/include/asm-generic/int-ll64.h index ffb68d6..a248545 100644 --- a/include/asm-generic/int-ll64.h +++ b/include/asm-generic/int-ll64.h @@ -13,17 +13,14 @@ #ifndef __ASSEMBLY__ -typedef signed char s8; -typedef unsigned char u8; - -typedef signed short s16; -typedef unsigned short u16; - -typedef signed int s32; -typedef unsigned int u32; - -typedef signed long long s64; -typedef unsigned long long u64; +typedef __s8 s8; +typedef __u8 u8; +typedef __s16 s16; +typedef __u16 u16; +typedef __s32 s32; +typedef __u32 u32; +typedef __s64 s64; +typedef __u64 u64; #define S8_C(x) x #define U8_C(x) x ## U diff --git a/include/linux/dax.h b/include/linux/dax.h index c99692d..88504e8 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -125,8 +125,8 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, int *errp, const struct iomap_ops *ops); -int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - pfn_t pfn); +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, pfn_t pfn); int 
dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fc5ab85..a6afcec 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -24,6 +24,7 @@ struct vm_area_struct; #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u +#define ___GFP_WRITE 0x100u #define ___GFP_NOWARN 0x200u #define ___GFP_RETRY_MAYFAIL 0x400u #define ___GFP_NOFAIL 0x800u @@ -36,11 +37,10 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x40000u #define ___GFP_ATOMIC 0x80000u #define ___GFP_ACCOUNT 0x100000u -#define ___GFP_DIRECT_RECLAIM 0x400000u -#define ___GFP_WRITE 0x800000u -#define ___GFP_KSWAPD_RECLAIM 0x1000000u +#define ___GFP_DIRECT_RECLAIM 0x200000u +#define ___GFP_KSWAPD_RECLAIM 0x400000u #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x2000000u +#define ___GFP_NOLOCKDEP 0x800000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -205,7 +205,7 @@ struct vm_area_struct; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* @@ -343,7 +343,7 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL * 0x3 => BAD (DMA+HIGHMEM) - * 0x4 => DMA32 or DMA or NORMAL + * 0x4 => DMA32 or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) * 0x7 => BAD (HIGHMEM+DMA32+DMA) @@ -351,7 +351,7 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) - * 0xc => DMA32 (MOVABLE+DMA32) + * 0xc => DMA32 or NORMAL (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD 
(MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 2f1327c..4c92e3b 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -522,9 +522,7 @@ void hmm_devmem_remove(struct hmm_devmem *devmem); static inline void hmm_devmem_page_set_drvdata(struct page *page, unsigned long data) { - unsigned long *drvdata = (unsigned long *)&page->pgmap; - - drvdata[1] = data; + page->hmm_data = data; } /* @@ -535,9 +533,7 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page, */ static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) { - const unsigned long *drvdata = (const unsigned long *)&page->pgmap; - - return drvdata[1]; + return page->hmm_data; } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7aed926..7c4e8f1 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -29,6 +29,7 @@ #define LLONG_MIN (-LLONG_MAX - 1) #define ULLONG_MAX (~0ULL) #define SIZE_MAX (~(size_t)0) +#define PHYS_ADDR_MAX (~(phys_addr_t)0) #define U8_MAX ((u8)~0U) #define S8_MAX ((s8)(U8_MAX>>1)) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 44368b1..161e816 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -37,17 +37,6 @@ static inline void ksm_exit(struct mm_struct *mm) __ksm_exit(mm); } -static inline struct stable_node *page_stable_node(struct page *page) -{ - return PageKsm(page) ? 
page_rmapping(page) : NULL; -} - -static inline void set_page_stable_node(struct page *page, - struct stable_node *stable_node) -{ - page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); -} - /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, @@ -89,12 +78,6 @@ static inline struct page *ksm_might_need_to_copy(struct page *page, return page; } -static inline int page_referenced_ksm(struct page *page, - struct mem_cgroup *memcg, unsigned long *vm_flags) -{ - return 0; -} - static inline void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d99b71b..4f52ec755 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -53,9 +53,17 @@ enum memcg_memory_event { MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, + MEMCG_SWAP_MAX, + MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, }; +enum mem_cgroup_protection { + MEMCG_PROT_NONE, + MEMCG_PROT_LOW, + MEMCG_PROT_MIN, +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int priority; @@ -158,6 +166,15 @@ enum memcg_kmem_state { KMEM_ONLINE, }; +#if defined(CONFIG_SMP) +struct memcg_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define MEMCG_PADDING(name) struct memcg_padding name; +#else +#define MEMCG_PADDING(name) +#endif + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. 
We would eventually like to provide @@ -179,8 +196,7 @@ struct mem_cgroup { struct page_counter kmem; struct page_counter tcpmem; - /* Normal memory consumption range */ - unsigned long low; + /* Upper bound of normal memory consumption range */ unsigned long high; /* Range enforcement for interrupt charges */ @@ -205,9 +221,11 @@ struct mem_cgroup { int oom_kill_disable; /* memory.events */ - atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; + /* handle for "memory.swap.events" */ + struct cgroup_file swap_events_file; + /* protect arrays of thresholds */ struct mutex thresholds_lock; @@ -225,19 +243,26 @@ struct mem_cgroup { * mem_cgroup ? And what type of charges should we move ? */ unsigned long move_charge_at_immigrate; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + unsigned long move_lock_flags; + + MEMCG_PADDING(_pad1_); + /* * set > 0 if pages under this cgroup are moving to other cgroup. */ atomic_t moving_account; - /* taken only while moving_account > 0 */ - spinlock_t move_lock; struct task_struct *move_lock_task; - unsigned long move_lock_flags; /* memory.stat */ struct mem_cgroup_stat_cpu __percpu *stat_cpu; + + MEMCG_PADDING(_pad2_); + atomic_long_t stat[MEMCG_NR_STAT]; atomic_long_t events[NR_VM_EVENT_ITEMS]; + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; @@ -285,7 +310,8 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp, @@ -462,7 +488,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, void mem_cgroup_handle_over_high(void); -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg); +unsigned long 
mem_cgroup_get_max(struct mem_cgroup *memcg); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); @@ -730,10 +756,10 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, { } -static inline bool mem_cgroup_low(struct mem_cgroup *root, - struct mem_cgroup *memcg) +static inline enum mem_cgroup_protection mem_cgroup_protected( + struct mem_cgroup *root, struct mem_cgroup *memcg) { - return false; + return MEMCG_PROT_NONE; } static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, @@ -853,7 +879,7 @@ mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, return 0; } -static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } @@ -1093,7 +1119,6 @@ static inline void dec_lruvec_page_state(struct page *page, #ifdef CONFIG_CGROUP_WRITEBACK -struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, diff --git a/include/linux/memfd.h b/include/linux/memfd.h new file mode 100644 index 0000000..4f16004 --- /dev/null +++ b/include/linux/memfd.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_MEMFD_H +#define __LINUX_MEMFD_H + +#include <linux/file.h> + +#ifdef CONFIG_MEMFD_CREATE +extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); +#else +static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) +{ + return -EINVAL; +} +#endif + +#endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2b02652..4e9828c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -107,7 +107,6 @@ static inline bool movable_node_is_enabled(void) } #ifdef CONFIG_MEMORY_HOTREMOVE 
-extern bool is_pageblock_removable_nolock(struct page *page); extern int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap); extern int __remove_pages(struct zone *zone, unsigned long start_pfn, diff --git a/include/linux/mm.h b/include/linux/mm.h index 29c5458..4c3881b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1851,6 +1851,7 @@ static inline bool pgtable_page_ctor(struct page *page) { if (!ptlock_init(page)) return false; + __SetPageTable(page); inc_zone_page_state(page, NR_PAGETABLE); return true; } @@ -1858,6 +1859,7 @@ static inline bool pgtable_page_ctor(struct page *page) static inline void pgtable_page_dtor(struct page *page) { pte_lock_deinit(page); + __ClearPageTable(page); dec_zone_page_state(page, NR_PAGETABLE); } @@ -2303,10 +2305,10 @@ extern void truncate_inode_pages_range(struct address_space *, extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ -extern int filemap_fault(struct vm_fault *vmf); +extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); -extern int filemap_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); /* mm/page-writeback.c */ int __must_check write_one_page(struct page *page); @@ -2431,8 +2433,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn); +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, @@ -2530,12 +2532,10 @@ extern int apply_to_page_range(struct 
mm_struct *mm, unsigned long address, #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); extern void kernel_poison_pages(struct page *page, int numpages, int enable); -extern bool page_is_poisoned(struct page *page); #else static inline bool page_poisoning_enabled(void) { return false; } static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } -static inline bool page_is_poisoned(struct page *page) { return false; } #endif #ifdef CONFIG_DEBUG_PAGEALLOC diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2161234..99ce070 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -33,29 +33,27 @@ struct hmm; * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us - * who is mapping it. If you allocate the page using alloc_pages(), you - * can use some of the space in struct page for your own purposes. + * who is mapping it. * - * Pages that were once in the page cache may be found under the RCU lock - * even after they have been recycled to a different purpose. The page - * cache reads and writes some of the fields in struct page to pin the - * page before checking that it's still in the page cache. It is vital - * that all users of struct page: - * 1. Use the first word as PageFlags. - * 2. Clear or preserve bit 0 of page->compound_head. It is used as - * PageTail for compound pages, and the page cache must not see false - * positives. Some users put a pointer here (guaranteed to be at least - * 4-byte aligned), other users avoid using the field altogether. - * 3. page->_refcount must either not be used, or must be used in such a - * way that other CPUs temporarily incrementing and then decrementing the - * refcount does not cause problems. On receiving the page from - * alloc_pages(), the refcount will be positive. - * 4. 
Either preserve page->_mapcount or restore it to -1 before freeing it. + * If you allocate the page using alloc_pages(), you can use some of the + * space in struct page for your own purposes. The five words in the main + * union are available, except for bit 0 of the first word which must be + * kept clear. Many users use this word to store a pointer to an object + * which is guaranteed to be aligned. If you use the same storage as + * page->mapping, you must restore it to NULL before freeing the page. * - * If you allocate pages of order > 0, you can use the fields in the struct - * page associated with each page, but bear in mind that the pages may have - * been inserted individually into the page cache, so you must use the above - * four fields in a compatible way for each struct page. + * If your page will not be mapped to userspace, you can also use the four + * bytes in the mapcount union, but you must call page_mapcount_reset() + * before freeing it. + * + * If you want to use the refcount field, it must be used in such a way + * that other CPUs temporarily incrementing and then decrementing the + * refcount does not cause problems. On receiving the page from + * alloc_pages(), the refcount will be positive. + * + * If you allocate pages of order > 0, you can use some of the fields + * in each subpage, but you may need to restore some of their values + * afterwards. * * SLUB uses cmpxchg_double() to atomically update its freelist and * counters. 
That requires that freelist & counters be adjacent and @@ -65,135 +63,122 @@ struct hmm; */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) -#define _slub_counter_t unsigned long #else -#define _slub_counter_t unsigned int -#endif -#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ #define _struct_page_alignment -#define _slub_counter_t unsigned int -#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ +#endif struct page { - /* First double word block */ unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ - union { - /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ - struct address_space *mapping; - - void *s_mem; /* slab first object */ - atomic_t compound_mapcount; /* first tail page */ - /* page_deferred_list().next -- second tail page */ - }; - - /* Second double word */ - union { - pgoff_t index; /* Our offset within mapping. */ - void *freelist; /* sl[aou]b first free object */ - /* page_deferred_list().prev -- second tail page */ - }; - - union { - _slub_counter_t counters; - unsigned int active; /* SLAB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; - int units; /* SLOB */ - - struct { /* Page cache */ - /* - * Count of ptes mapped in mms, to show when - * page is mapped & limit reverse map searches. - * - * Extra information about page type may be - * stored here for pages that are never mapped, - * in which case the value MUST BE <= -2. - * See page-flags.h for more details. - */ - atomic_t _mapcount; - - /* - * Usage count, *USE WRAPPER FUNCTION* when manual - * accounting. See page_ref.h - */ - atomic_t _refcount; - }; - }; - /* - * WARNING: bit 0 of the first word encode PageTail(). That means - * the rest users of the storage space MUST NOT use the bit to + * Five words (20/40 bytes) are available in this union. + * WARNING: bit 0 of the first word is used for PageTail(). 
That + * means the other users of this union MUST NOT use the bit to * avoid collision and false-positive PageTail(). */ union { - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone_lru_lock ! - * Can be used as a generic list - * by the page owner. - */ - struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an - * lru or handled by a slab - * allocator, this points to the - * hosting device page map. - */ - struct { /* slub per cpu partial pages */ - struct page *next; /* Next partial slab */ + struct { /* Page cache and anonymous pages */ + /** + * @lru: Pageout list, eg. active_list protected by + * zone_lru_lock. Sometimes used as a generic list + * by the page owner. + */ + struct list_head lru; + /* See page-flags.h for PAGE_MAPPING_FLAGS */ + struct address_space *mapping; + pgoff_t index; /* Our offset within mapping. */ + /** + * @private: Mapping-private opaque data. + * Usually used for buffer_heads if PagePrivate. + * Used for swp_entry_t if PageSwapCache. + * Indicates order in the buddy system if PageBuddy. 
+ */ + unsigned long private; + }; + struct { /* slab, slob and slub */ + union { + struct list_head slab_list; /* uses lru */ + struct { /* Partial pages */ + struct page *next; #ifdef CONFIG_64BIT - int pages; /* Nr of partial slabs left */ - int pobjects; /* Approximate # of objects */ + int pages; /* Nr of pages left */ + int pobjects; /* Approximate count */ #else - short int pages; - short int pobjects; + short int pages; + short int pobjects; #endif + }; + }; + struct kmem_cache *slab_cache; /* not slob */ + /* Double-word boundary */ + void *freelist; /* first free object */ + union { + void *s_mem; /* slab: first object */ + unsigned long counters; /* SLUB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; }; - - struct rcu_head rcu_head; /* Used by SLAB - * when destroying via RCU - */ - /* Tail pages of compound page */ - struct { - unsigned long compound_head; /* If bit zero is set */ + struct { /* Tail pages of compound page */ + unsigned long compound_head; /* Bit zero is set */ /* First tail page only */ unsigned char compound_dtor; unsigned char compound_order; - /* two/six bytes available here */ + atomic_t compound_mapcount; }; - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - struct { - unsigned long __pad; /* do not overlay pmd_huge_pte - * with compound_head to avoid - * possible bit 0 collision. 
- */ - pgtable_t pmd_huge_pte; /* protected by page->ptl */ + struct { /* Second tail page of compound page */ + unsigned long _compound_pad_1; /* compound_head */ + unsigned long _compound_pad_2; + struct list_head deferred_list; }; + struct { /* Page table pages */ + unsigned long _pt_pad_1; /* compound_head */ + pgtable_t pmd_huge_pte; /* protected by page->ptl */ + unsigned long _pt_pad_2; /* mapping */ + struct mm_struct *pt_mm; /* x86 pgds only */ +#if ALLOC_SPLIT_PTLOCKS + spinlock_t *ptl; +#else + spinlock_t ptl; #endif + }; + struct { /* ZONE_DEVICE pages */ + /** @pgmap: Points to the hosting device page map. */ + struct dev_pagemap *pgmap; + unsigned long hmm_data; + unsigned long _zd_pad_1; /* uses mapping */ + }; + + /** @rcu_head: You can use this to free a page by RCU. */ + struct rcu_head rcu_head; }; - union { + union { /* This union is 4 bytes in size. */ /* - * Mapping-private opaque data: - * Usually used for buffer_heads if PagePrivate - * Used for swp_entry_t if PageSwapCache - * Indicates order in the buddy system if PageBuddy + * If the page can be mapped to userspace, encodes the number + * of times this page is referenced by a page table. */ - unsigned long private; -#if USE_SPLIT_PTE_PTLOCKS -#if ALLOC_SPLIT_PTLOCKS - spinlock_t *ptl; -#else - spinlock_t ptl; -#endif -#endif - struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ + atomic_t _mapcount; + + /* + * If the page is neither PageSlab nor mappable to userspace, + * the value stored here may help determine what this page + * is used for. See page-flags.h for a list of page types + * which are currently stored here. + */ + unsigned int page_type; + + unsigned int active; /* SLAB */ + int units; /* SLOB */ }; + /* Usage count. *DO NOT USE DIRECTLY*. 
See page_ref.h */ + atomic_t _refcount; + #ifdef CONFIG_MEMCG struct mem_cgroup *mem_cgroup; #endif @@ -413,6 +398,8 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ unsigned long def_flags; + + spinlock_t arg_lock; /* protect the below fields */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; @@ -627,9 +614,9 @@ struct vm_special_mapping { * If non-NULL, then this is called to resolve page faults * on the special mapping. If used, .pages is not checked. */ - int (*fault)(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, - struct vm_fault *vmf); + vm_fault_t (*fault)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, + struct vm_fault *vmf); int (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); diff --git a/include/linux/mpi.h b/include/linux/mpi.h index 1cc5ffb..7cd1473 100644 --- a/include/linux/mpi.h +++ b/include/linux/mpi.h @@ -53,93 +53,32 @@ struct gcry_mpi { typedef struct gcry_mpi *MPI; #define mpi_get_nlimbs(a) ((a)->nlimbs) -#define mpi_is_neg(a) ((a)->sign) /*-- mpiutil.c --*/ MPI mpi_alloc(unsigned nlimbs); -MPI mpi_alloc_secure(unsigned nlimbs); -MPI mpi_alloc_like(MPI a); void mpi_free(MPI a); int mpi_resize(MPI a, unsigned nlimbs); -int mpi_copy(MPI *copy, const MPI a); -void mpi_clear(MPI a); -int mpi_set(MPI w, MPI u); -int mpi_set_ui(MPI w, ulong u); -MPI mpi_alloc_set_ui(unsigned long u); -void mpi_m_check(MPI a); -void mpi_swap(MPI a, MPI b); /*-- mpicoder.c --*/ -MPI do_encode_md(const void *sha_buffer, unsigned nbits); MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes); MPI mpi_read_from_buffer(const void *buffer, unsigned *ret_nread); MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len); -int mpi_fromstr(MPI val, const char *str); -u32 mpi_get_keyid(MPI a, u32 *keyid); void 
*mpi_get_buffer(MPI a, unsigned *nbytes, int *sign); int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes, int *sign); -void *mpi_get_secure_buffer(MPI a, unsigned *nbytes, int *sign); int mpi_write_to_sgl(MPI a, struct scatterlist *sg, unsigned nbytes, int *sign); -#define log_mpidump g10_log_mpidump - -/*-- mpi-add.c --*/ -int mpi_add_ui(MPI w, MPI u, ulong v); -int mpi_add(MPI w, MPI u, MPI v); -int mpi_addm(MPI w, MPI u, MPI v, MPI m); -int mpi_sub_ui(MPI w, MPI u, ulong v); -int mpi_sub(MPI w, MPI u, MPI v); -int mpi_subm(MPI w, MPI u, MPI v, MPI m); - -/*-- mpi-mul.c --*/ -int mpi_mul_ui(MPI w, MPI u, ulong v); -int mpi_mul_2exp(MPI w, MPI u, ulong cnt); -int mpi_mul(MPI w, MPI u, MPI v); -int mpi_mulm(MPI w, MPI u, MPI v, MPI m); - -/*-- mpi-div.c --*/ -ulong mpi_fdiv_r_ui(MPI rem, MPI dividend, ulong divisor); -int mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor); -int mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor); -int mpi_fdiv_qr(MPI quot, MPI rem, MPI dividend, MPI divisor); -int mpi_tdiv_r(MPI rem, MPI num, MPI den); -int mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den); -int mpi_tdiv_q_2exp(MPI w, MPI u, unsigned count); -int mpi_divisible_ui(const MPI dividend, ulong divisor); - -/*-- mpi-gcd.c --*/ -int mpi_gcd(MPI g, const MPI a, const MPI b); - /*-- mpi-pow.c --*/ -int mpi_pow(MPI w, MPI u, MPI v); int mpi_powm(MPI res, MPI base, MPI exp, MPI mod); -/*-- mpi-mpow.c --*/ -int mpi_mulpowm(MPI res, MPI *basearray, MPI *exparray, MPI mod); - /*-- mpi-cmp.c --*/ int mpi_cmp_ui(MPI u, ulong v); int mpi_cmp(MPI u, MPI v); -/*-- mpi-scan.c --*/ -int mpi_getbyte(MPI a, unsigned idx); -void mpi_putbyte(MPI a, unsigned idx, int value); -unsigned mpi_trailing_zeros(MPI a); - /*-- mpi-bit.c --*/ void mpi_normalize(MPI a); unsigned mpi_get_nbits(MPI a); -int mpi_test_bit(MPI a, unsigned n); -int mpi_set_bit(MPI a, unsigned n); -int mpi_set_highbit(MPI a, unsigned n); -void mpi_clear_highbit(MPI a, unsigned n); -void mpi_clear_bit(MPI a, 
unsigned n); -int mpi_rshift(MPI x, MPI a, unsigned n); - -/*-- mpi-inv.c --*/ -int mpi_invm(MPI x, MPI u, MPI v); /* inline functions */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e34a277..901943e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -642,49 +642,62 @@ PAGEFLAG_FALSE(DoubleMap) #endif /* - * For pages that are never mapped to userspace, page->mapcount may be - * used for storing extra information about page type. Any value used - * for this purpose must be <= -2, but it's better start not too close - * to -2 so that an underflow of the page_mapcount() won't be mistaken - * for a special page. + * For pages that are never mapped to userspace (and aren't PageSlab), + * page_type may be used. Because it is initialised to -1, we invert the + * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and + * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and + * low bits so that an underflow or overflow of page_mapcount() won't be + * mistaken for a page type value. 
*/ -#define PAGE_MAPCOUNT_OPS(uname, lname) \ + +#define PAGE_TYPE_BASE 0xf0000000 +/* Reserve 0x0000007f to catch underflows of page_mapcount */ +#define PG_buddy 0x00000080 +#define PG_balloon 0x00000100 +#define PG_kmemcg 0x00000200 +#define PG_table 0x00000400 + +#define PageType(page, flag) \ + ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + +#define PAGE_TYPE_OPS(uname, lname) \ static __always_inline int Page##uname(struct page *page) \ { \ - return atomic_read(&page->_mapcount) == \ - PAGE_##lname##_MAPCOUNT_VALUE; \ + return PageType(page, PG_##lname); \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); \ - atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE); \ + VM_BUG_ON_PAGE(!PageType(page, 0), page); \ + page->page_type &= ~PG_##lname; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ - atomic_set(&page->_mapcount, -1); \ + page->page_type |= PG_##lname; \ } /* - * PageBuddy() indicate that the page is free and in the buddy system + * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ -#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) -PAGE_MAPCOUNT_OPS(Buddy, BUDDY) +PAGE_TYPE_OPS(Buddy, buddy) /* - * PageBalloon() is set on pages that are on the balloon page list + * PageBalloon() is true for pages that are on the balloon page list * (see mm/balloon_compaction.c). */ -#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) -PAGE_MAPCOUNT_OPS(Balloon, BALLOON) +PAGE_TYPE_OPS(Balloon, balloon) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. */ -#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512) -PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG) +PAGE_TYPE_OPS(Kmemcg, kmemcg) + +/* + * Marks pages in use as page tables. 
+ */ +PAGE_TYPE_OPS(Table, table) extern bool is_free_buddy_page(struct page *page); diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index c15ab80..bab7e57 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -7,10 +7,22 @@ #include <asm/page.h> struct page_counter { - atomic_long_t count; - unsigned long limit; + atomic_long_t usage; + unsigned long min; + unsigned long low; + unsigned long max; struct page_counter *parent; + /* effective memory.min and memory.min usage tracking */ + unsigned long emin; + atomic_long_t min_usage; + atomic_long_t children_min_usage; + + /* effective memory.low and memory.low usage tracking */ + unsigned long elow; + atomic_long_t low_usage; + atomic_long_t children_low_usage; + /* legacy */ unsigned long watermark; unsigned long failcnt; @@ -25,14 +37,14 @@ struct page_counter { static inline void page_counter_init(struct page_counter *counter, struct page_counter *parent) { - atomic_long_set(&counter->count, 0); - counter->limit = PAGE_COUNTER_MAX; + atomic_long_set(&counter->usage, 0); + counter->max = PAGE_COUNTER_MAX; counter->parent = parent; } static inline unsigned long page_counter_read(struct page_counter *counter) { - return atomic_long_read(&counter->count); + return atomic_long_read(&counter->usage); } void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); @@ -41,7 +53,9 @@ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_limit(struct page_counter *counter, unsigned long limit); +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, 
unsigned long *nr_pages); diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index a03c264..21713dc 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -122,7 +122,7 @@ pud_t pud_mkdevmap(pud_t pud); #endif #endif /* __HAVE_ARCH_PTE_DEVMAP */ -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static inline bool pfn_t_special(pfn_t pfn) { return (pfn.val & PFN_SPECIAL) == PFN_SPECIAL; @@ -132,5 +132,5 @@ static inline bool pfn_t_special(pfn_t pfn) { return false; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #endif /* _LINUX_PFN_T_H_ */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 76a8cb4..44d356f 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -163,9 +163,13 @@ static inline gfp_t current_gfp_context(gfp_t flags) } #ifdef CONFIG_LOCKDEP +extern void __fs_reclaim_acquire(void); +extern void __fs_reclaim_release(void); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else +static inline void __fs_reclaim_acquire(void) { } +static inline void __fs_reclaim_release(void) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 73b5e65..f155dc6 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -110,19 +110,6 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); -#ifdef CONFIG_TMPFS - -extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); - -#else - -static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) -{ - return -EINVAL; -} - -#endif - #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE extern bool shmem_huge_enabled(struct vm_area_struct *vma); #else diff --git 
a/include/linux/slab_def.h b/include/linux/slab_def.h index d9228e4..3485c58 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -67,9 +67,10 @@ struct kmem_cache { /* * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. size contains the total - * object size including these internal fields, the following two - * variables contain the offset to the user object and its size. + * fields and/or padding to every object. 'size' contains the total + * object size including these internal fields, while 'obj_offset' + * and 'object_size' contain the offset to the user object and its + * size. */ int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 3773e26..09fa2c6 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -101,7 +101,6 @@ struct kmem_cache { void (*ctor)(void *); unsigned int inuse; /* Offset to metadata */ unsigned int align; /* Alignment */ - unsigned int reserved; /* Reserved bytes at the end of slabs */ unsigned int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) 
*/ struct list_head list; /* List of slab caches */ diff --git a/include/linux/types.h b/include/linux/types.h index ec13d02..9834e90 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -10,14 +10,14 @@ #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] -typedef __u32 __kernel_dev_t; +typedef u32 __kernel_dev_t; typedef __kernel_fd_set fd_set; typedef __kernel_dev_t dev_t; typedef __kernel_ino_t ino_t; typedef __kernel_mode_t mode_t; typedef unsigned short umode_t; -typedef __u32 nlink_t; +typedef u32 nlink_t; typedef __kernel_off_t off_t; typedef __kernel_pid_t pid_t; typedef __kernel_daddr_t daddr_t; @@ -95,29 +95,29 @@ typedef unsigned long ulong; #ifndef __BIT_TYPES_DEFINED__ #define __BIT_TYPES_DEFINED__ -typedef __u8 u_int8_t; -typedef __s8 int8_t; -typedef __u16 u_int16_t; -typedef __s16 int16_t; -typedef __u32 u_int32_t; -typedef __s32 int32_t; +typedef u8 u_int8_t; +typedef s8 int8_t; +typedef u16 u_int16_t; +typedef s16 int16_t; +typedef u32 u_int32_t; +typedef s32 int32_t; #endif /* !(__BIT_TYPES_DEFINED__) */ -typedef __u8 uint8_t; -typedef __u16 uint16_t; -typedef __u32 uint32_t; +typedef u8 uint8_t; +typedef u16 uint16_t; +typedef u32 uint32_t; #if defined(__GNUC__) -typedef __u64 uint64_t; -typedef __u64 u_int64_t; -typedef __s64 int64_t; +typedef u64 uint64_t; +typedef u64 u_int64_t; +typedef s64 int64_t; #endif /* this is a special 64bit data type that is 8-byte aligned */ -#define aligned_u64 __u64 __attribute__((aligned(8))) -#define aligned_be64 __be64 __attribute__((aligned(8))) -#define aligned_le64 __le64 __attribute__((aligned(8))) +#define aligned_u64 __aligned_u64 +#define aligned_be64 __aligned_be64 +#define aligned_le64 __aligned_le64 /** * The type used for indexing onto a disc or disc partition. 
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f2f3b68..e091f0a 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -31,10 +31,12 @@ extern int handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len); + unsigned long src_start, unsigned long len, + bool *mmap_changing); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len); + unsigned long len, + bool *mmap_changing); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index 2a4432c..e13eec3 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* - * Copyright 1997 Transmeta Corporation - All Rights Reserved + * Copyright 1997 Transmeta Corporation - All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org> + * Copyright 2005-2006,2013,2017-2018 Ian Kent <raven@themaw.net> * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your @@ -8,7 +10,6 @@ * * ----------------------------------------------------------------------- */ - #ifndef _UAPI_LINUX_AUTO_FS_H #define _UAPI_LINUX_AUTO_FS_H @@ -18,13 +19,11 @@ #include <sys/ioctl.h> #endif /* __KERNEL__ */ +#define AUTOFS_PROTO_VERSION 5 +#define AUTOFS_MIN_PROTO_VERSION 3 +#define AUTOFS_MAX_PROTO_VERSION 5 -/* This file describes autofs v3 */ -#define AUTOFS_PROTO_VERSION 3 - -/* Range of protocol versions defined */ -#define AUTOFS_MAX_PROTO_VERSION AUTOFS_PROTO_VERSION -#define AUTOFS_MIN_PROTO_VERSION AUTOFS_PROTO_VERSION +#define AUTOFS_PROTO_SUBVERSION 2 /* * The wait_queue_token (autofs_wqt_t) is part of 
a structure which is passed @@ -76,9 +75,155 @@ enum { #define AUTOFS_IOC_READY _IO(AUTOFS_IOCTL, AUTOFS_IOC_READY_CMD) #define AUTOFS_IOC_FAIL _IO(AUTOFS_IOCTL, AUTOFS_IOC_FAIL_CMD) #define AUTOFS_IOC_CATATONIC _IO(AUTOFS_IOCTL, AUTOFS_IOC_CATATONIC_CMD) -#define AUTOFS_IOC_PROTOVER _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOVER_CMD, int) -#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(AUTOFS_IOCTL, AUTOFS_IOC_SETTIMEOUT_CMD, compat_ulong_t) -#define AUTOFS_IOC_SETTIMEOUT _IOWR(AUTOFS_IOCTL, AUTOFS_IOC_SETTIMEOUT_CMD, unsigned long) -#define AUTOFS_IOC_EXPIRE _IOR(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_CMD, struct autofs_packet_expire) +#define AUTOFS_IOC_PROTOVER _IOR(AUTOFS_IOCTL, \ + AUTOFS_IOC_PROTOVER_CMD, int) +#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(AUTOFS_IOCTL, \ + AUTOFS_IOC_SETTIMEOUT_CMD, \ + compat_ulong_t) +#define AUTOFS_IOC_SETTIMEOUT _IOWR(AUTOFS_IOCTL, \ + AUTOFS_IOC_SETTIMEOUT_CMD, \ + unsigned long) +#define AUTOFS_IOC_EXPIRE _IOR(AUTOFS_IOCTL, \ + AUTOFS_IOC_EXPIRE_CMD, \ + struct autofs_packet_expire) + +/* autofs version 4 and later definitions */ + +/* Mask for expire behaviour */ +#define AUTOFS_EXP_IMMEDIATE 1 +#define AUTOFS_EXP_LEAVES 2 + +#define AUTOFS_TYPE_ANY 0U +#define AUTOFS_TYPE_INDIRECT 1U +#define AUTOFS_TYPE_DIRECT 2U +#define AUTOFS_TYPE_OFFSET 4U + +static inline void set_autofs_type_indirect(unsigned int *type) +{ + *type = AUTOFS_TYPE_INDIRECT; +} + +static inline unsigned int autofs_type_indirect(unsigned int type) +{ + return (type == AUTOFS_TYPE_INDIRECT); +} + +static inline void set_autofs_type_direct(unsigned int *type) +{ + *type = AUTOFS_TYPE_DIRECT; +} + +static inline unsigned int autofs_type_direct(unsigned int type) +{ + return (type == AUTOFS_TYPE_DIRECT); +} + +static inline void set_autofs_type_offset(unsigned int *type) +{ + *type = AUTOFS_TYPE_OFFSET; +} + +static inline unsigned int autofs_type_offset(unsigned int type) +{ + return (type == AUTOFS_TYPE_OFFSET); +} + +static inline unsigned int autofs_type_trigger(unsigned int 
type) +{ + return (type == AUTOFS_TYPE_DIRECT || type == AUTOFS_TYPE_OFFSET); +} + +/* + * This isn't really a type as we use it to say "no type set" to + * indicate we want to search for "any" mount in the + * autofs_dev_ioctl_ismountpoint() device ioctl function. + */ +static inline void set_autofs_type_any(unsigned int *type) +{ + *type = AUTOFS_TYPE_ANY; +} + +static inline unsigned int autofs_type_any(unsigned int type) +{ + return (type == AUTOFS_TYPE_ANY); +} + +/* Daemon notification packet types */ +enum autofs_notify { + NFY_NONE, + NFY_MOUNT, + NFY_EXPIRE +}; + +/* Kernel protocol version 4 packet types */ + +/* Expire entry (umount request) */ +#define autofs_ptype_expire_multi 2 + +/* Kernel protocol version 5 packet types */ + +/* Indirect mount missing and expire requests. */ +#define autofs_ptype_missing_indirect 3 +#define autofs_ptype_expire_indirect 4 + +/* Direct mount missing and expire requests */ +#define autofs_ptype_missing_direct 5 +#define autofs_ptype_expire_direct 6 + +/* v4 multi expire (via pipe) */ +struct autofs_packet_expire_multi { + struct autofs_packet_hdr hdr; + autofs_wqt_t wait_queue_token; + int len; + char name[NAME_MAX+1]; +}; + +union autofs_packet_union { + struct autofs_packet_hdr hdr; + struct autofs_packet_missing missing; + struct autofs_packet_expire expire; + struct autofs_packet_expire_multi expire_multi; +}; + +/* autofs v5 common packet struct */ +struct autofs_v5_packet { + struct autofs_packet_hdr hdr; + autofs_wqt_t wait_queue_token; + __u32 dev; + __u64 ino; + __u32 uid; + __u32 gid; + __u32 pid; + __u32 tgid; + __u32 len; + char name[NAME_MAX+1]; +}; + +typedef struct autofs_v5_packet autofs_packet_missing_indirect_t; +typedef struct autofs_v5_packet autofs_packet_expire_indirect_t; +typedef struct autofs_v5_packet autofs_packet_missing_direct_t; +typedef struct autofs_v5_packet autofs_packet_expire_direct_t; + +union autofs_v5_packet_union { + struct autofs_packet_hdr hdr; + struct autofs_v5_packet 
v5_packet; + autofs_packet_missing_indirect_t missing_indirect; + autofs_packet_expire_indirect_t expire_indirect; + autofs_packet_missing_direct_t missing_direct; + autofs_packet_expire_direct_t expire_direct; +}; + +enum { + AUTOFS_IOC_EXPIRE_MULTI_CMD = 0x66, /* AUTOFS_IOC_EXPIRE_CMD + 1 */ + AUTOFS_IOC_PROTOSUBVER_CMD, + AUTOFS_IOC_ASKUMOUNT_CMD = 0x70, /* AUTOFS_DEV_IOCTL_VERSION_CMD - 1 */ +}; + +#define AUTOFS_IOC_EXPIRE_MULTI _IOW(AUTOFS_IOCTL, \ + AUTOFS_IOC_EXPIRE_MULTI_CMD, int) +#define AUTOFS_IOC_PROTOSUBVER _IOR(AUTOFS_IOCTL, \ + AUTOFS_IOC_PROTOSUBVER_CMD, int) +#define AUTOFS_IOC_ASKUMOUNT _IOR(AUTOFS_IOCTL, \ + AUTOFS_IOC_ASKUMOUNT_CMD, int) #endif /* _UAPI_LINUX_AUTO_FS_H */ diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index 1f608e2..d01ef0a 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -7,156 +7,9 @@ * option, any later version, incorporated herein by reference. */ -#ifndef _LINUX_AUTO_FS4_H -#define _LINUX_AUTO_FS4_H +#ifndef _UAPI_LINUX_AUTO_FS4_H +#define _UAPI_LINUX_AUTO_FS4_H -/* Include common v3 definitions */ -#include <linux/types.h> #include <linux/auto_fs.h> -/* autofs v4 definitions */ -#undef AUTOFS_PROTO_VERSION -#undef AUTOFS_MIN_PROTO_VERSION -#undef AUTOFS_MAX_PROTO_VERSION - -#define AUTOFS_PROTO_VERSION 5 -#define AUTOFS_MIN_PROTO_VERSION 3 -#define AUTOFS_MAX_PROTO_VERSION 5 - -#define AUTOFS_PROTO_SUBVERSION 2 - -/* Mask for expire behaviour */ -#define AUTOFS_EXP_IMMEDIATE 1 -#define AUTOFS_EXP_LEAVES 2 - -#define AUTOFS_TYPE_ANY 0U -#define AUTOFS_TYPE_INDIRECT 1U -#define AUTOFS_TYPE_DIRECT 2U -#define AUTOFS_TYPE_OFFSET 4U - -static inline void set_autofs_type_indirect(unsigned int *type) -{ - *type = AUTOFS_TYPE_INDIRECT; -} - -static inline unsigned int autofs_type_indirect(unsigned int type) -{ - return (type == AUTOFS_TYPE_INDIRECT); -} - -static inline void set_autofs_type_direct(unsigned int *type) -{ - *type = AUTOFS_TYPE_DIRECT; -} - -static inline 
unsigned int autofs_type_direct(unsigned int type) -{ - return (type == AUTOFS_TYPE_DIRECT); -} - -static inline void set_autofs_type_offset(unsigned int *type) -{ - *type = AUTOFS_TYPE_OFFSET; -} - -static inline unsigned int autofs_type_offset(unsigned int type) -{ - return (type == AUTOFS_TYPE_OFFSET); -} - -static inline unsigned int autofs_type_trigger(unsigned int type) -{ - return (type == AUTOFS_TYPE_DIRECT || type == AUTOFS_TYPE_OFFSET); -} - -/* - * This isn't really a type as we use it to say "no type set" to - * indicate we want to search for "any" mount in the - * autofs_dev_ioctl_ismountpoint() device ioctl function. - */ -static inline void set_autofs_type_any(unsigned int *type) -{ - *type = AUTOFS_TYPE_ANY; -} - -static inline unsigned int autofs_type_any(unsigned int type) -{ - return (type == AUTOFS_TYPE_ANY); -} - -/* Daemon notification packet types */ -enum autofs_notify { - NFY_NONE, - NFY_MOUNT, - NFY_EXPIRE -}; - -/* Kernel protocol version 4 packet types */ - -/* Expire entry (umount request) */ -#define autofs_ptype_expire_multi 2 - -/* Kernel protocol version 5 packet types */ - -/* Indirect mount missing and expire requests. 
*/ -#define autofs_ptype_missing_indirect 3 -#define autofs_ptype_expire_indirect 4 - -/* Direct mount missing and expire requests */ -#define autofs_ptype_missing_direct 5 -#define autofs_ptype_expire_direct 6 - -/* v4 multi expire (via pipe) */ -struct autofs_packet_expire_multi { - struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; - int len; - char name[NAME_MAX+1]; -}; - -union autofs_packet_union { - struct autofs_packet_hdr hdr; - struct autofs_packet_missing missing; - struct autofs_packet_expire expire; - struct autofs_packet_expire_multi expire_multi; -}; - -/* autofs v5 common packet struct */ -struct autofs_v5_packet { - struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; - __u32 dev; - __u64 ino; - __u32 uid; - __u32 gid; - __u32 pid; - __u32 tgid; - __u32 len; - char name[NAME_MAX+1]; -}; - -typedef struct autofs_v5_packet autofs_packet_missing_indirect_t; -typedef struct autofs_v5_packet autofs_packet_expire_indirect_t; -typedef struct autofs_v5_packet autofs_packet_missing_direct_t; -typedef struct autofs_v5_packet autofs_packet_expire_direct_t; - -union autofs_v5_packet_union { - struct autofs_packet_hdr hdr; - struct autofs_v5_packet v5_packet; - autofs_packet_missing_indirect_t missing_indirect; - autofs_packet_expire_indirect_t expire_indirect; - autofs_packet_missing_direct_t missing_direct; - autofs_packet_expire_direct_t expire_direct; -}; - -enum { - AUTOFS_IOC_EXPIRE_MULTI_CMD = 0x66, /* AUTOFS_IOC_EXPIRE_CMD + 1 */ - AUTOFS_IOC_PROTOSUBVER_CMD, - AUTOFS_IOC_ASKUMOUNT_CMD = 0x70, /* AUTOFS_DEV_IOCTL_VERSION_CMD - 1 */ -}; - -#define AUTOFS_IOC_EXPIRE_MULTI _IOW(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_MULTI_CMD, int) -#define AUTOFS_IOC_PROTOSUBVER _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOSUBVER_CMD, int) -#define AUTOFS_IOC_ASKUMOUNT _IOR(AUTOFS_IOCTL, AUTOFS_IOC_ASKUMOUNT_CMD, int) - -#endif /* _LINUX_AUTO_FS4_H */ +#endif /* _UAPI_LINUX_AUTO_FS4_H */ diff --git a/include/uapi/linux/kernel-page-flags.h 
b/include/uapi/linux/kernel-page-flags.h index fa13984..21b9113 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -35,6 +35,6 @@ #define KPF_BALLOON 23 #define KPF_ZERO_PAGE 24 #define KPF_IDLE 25 - +#define KPF_PGTABLE 26 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index f7674d6..b66aced 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -460,6 +460,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); diff --git a/kernel/fork.c b/kernel/fork.c index 80b48a8..c6d1c1c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -899,6 +899,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 751593e..32b4794 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -44,6 +44,7 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; +static bool hung_task_call_panic; static struct task_struct *watchdog_task; @@ -127,10 +128,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); if (sysctl_hung_task_panic) { - if (hung_task_show_lock) - debug_show_all_locks(); - trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); + hung_task_show_lock = true; + hung_task_call_panic = true; } } @@ -193,6 +192,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) 
debug_show_all_locks(); + if (hung_task_call_panic) { + trigger_all_cpu_backtrace(); + panic("hung_task: blocked tasks"); + } } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/sys.c b/kernel/sys.c index d1b2b8d..38509dc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2018,7 +2018,11 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data return error; } - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurrent updates but we still need mmap_sem for + * read to exclude races with sys_brk. + */ + down_read(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -2032,6 +2036,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * to any problem in kernel itself */ + spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; @@ -2043,6 +2048,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; + spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus @@ -2055,7 +2061,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/lib/bitmap.c b/lib/bitmap.c index a42eff7..58f9750 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -64,12 +64,9 @@ EXPORT_SYMBOL(__bitmap_equal); void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { - unsigned int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = BITS_TO_LONGS(bits); for (k = 0; k < lim; ++k) dst[k] = ~src[k]; - - if (bits % BITS_PER_LONG) - dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); diff --git
a/lib/bucket_locks.c b/lib/bucket_locks.c index 266a97c..ade3ce6 100644 --- a/lib/bucket_locks.c +++ b/lib/bucket_locks.c @@ -30,10 +30,7 @@ int alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *locks_mask, } if (sizeof(spinlock_t) != 0) { - if (gfpflags_allow_blocking(gfp)) - tlocks = kvmalloc(size * sizeof(spinlock_t), gfp); - else - tlocks = kmalloc_array(size, sizeof(spinlock_t), gfp); + tlocks = kvmalloc_array(size, sizeof(spinlock_t), gfp); if (!tlocks) return -ENOMEM; for (i = 0; i < size; i++) @@ -4,9 +4,9 @@ #include <linux/idr.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/xarray.h> DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap); -static DEFINE_SPINLOCK(simple_ida_lock); /** * idr_alloc_u32() - Allocate an ID. @@ -581,7 +581,7 @@ again: if (!ida_pre_get(ida, gfp_mask)) return -ENOMEM; - spin_lock_irqsave(&simple_ida_lock, flags); + xa_lock_irqsave(&ida->ida_rt, flags); ret = ida_get_new_above(ida, start, &id); if (!ret) { if (id > max) { @@ -591,7 +591,7 @@ again: ret = id; } } - spin_unlock_irqrestore(&simple_ida_lock, flags); + xa_unlock_irqrestore(&ida->ida_rt, flags); if (unlikely(ret == -EAGAIN)) goto again; @@ -615,8 +615,8 @@ void ida_simple_remove(struct ida *ida, unsigned int id) unsigned long flags; BUG_ON((int)id < 0); - spin_lock_irqsave(&simple_ida_lock, flags); + xa_lock_irqsave(&ida->ida_rt, flags); ida_remove(ida, id); - spin_unlock_irqrestore(&simple_ida_lock, flags); + xa_unlock_irqrestore(&ida->ida_rt, flags); } EXPORT_SYMBOL(ida_simple_remove); diff --git a/lib/mpi/mpi-internal.h b/lib/mpi/mpi-internal.h index 7eceedd..c2d6f4e 100644 --- a/lib/mpi/mpi-internal.h +++ b/lib/mpi/mpi-internal.h @@ -65,13 +65,6 @@ typedef mpi_limb_t *mpi_ptr_t; /* pointer to a limb */ typedef int mpi_size_t; /* (must be a signed type) */ -static inline int RESIZE_IF_NEEDED(MPI a, unsigned b) -{ - if (a->alloced < b) - return mpi_resize(a, b); - return 0; -} - /* Copy N limbs from S to D. 
*/ #define MPN_COPY(d, s, n) \ do { \ @@ -80,13 +73,6 @@ static inline int RESIZE_IF_NEEDED(MPI a, unsigned b) (d)[_i] = (s)[_i]; \ } while (0) -#define MPN_COPY_INCR(d, s, n) \ - do { \ - mpi_size_t _i; \ - for (_i = 0; _i < (n); _i++) \ - (d)[_i] = (s)[_i]; \ - } while (0) - #define MPN_COPY_DECR(d, s, n) \ do { \ mpi_size_t _i; \ @@ -111,15 +97,6 @@ static inline int RESIZE_IF_NEEDED(MPI a, unsigned b) } \ } while (0) -#define MPN_NORMALIZE_NOT_ZERO(d, n) \ - do { \ - for (;;) { \ - if ((d)[(n)-1]) \ - break; \ - (n)--; \ - } \ - } while (0) - #define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace) \ do { \ if ((size) < KARATSUBA_THRESHOLD) \ @@ -128,46 +105,11 @@ static inline int RESIZE_IF_NEEDED(MPI a, unsigned b) mul_n(prodp, up, vp, size, tspace); \ } while (0); -/* Divide the two-limb number in (NH,,NL) by D, with DI being the largest - * limb not larger than (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB). - * If this would yield overflow, DI should be the largest possible number - * (i.e., only ones). For correct operation, the most significant bit of D - * has to be set. Put the quotient in Q and the remainder in R. 
- */ -#define UDIV_QRNND_PREINV(q, r, nh, nl, d, di) \ - do { \ - mpi_limb_t _q, _ql, _r; \ - mpi_limb_t _xh, _xl; \ - umul_ppmm(_q, _ql, (nh), (di)); \ - _q += (nh); /* DI is 2**BITS_PER_MPI_LIMB too small */ \ - umul_ppmm(_xh, _xl, _q, (d)); \ - sub_ddmmss(_xh, _r, (nh), (nl), _xh, _xl); \ - if (_xh) { \ - sub_ddmmss(_xh, _r, _xh, _r, 0, (d)); \ - _q++; \ - if (_xh) { \ - sub_ddmmss(_xh, _r, _xh, _r, 0, (d)); \ - _q++; \ - } \ - } \ - if (_r >= (d)) { \ - _r -= (d); \ - _q++; \ - } \ - (r) = _r; \ - (q) = _q; \ - } while (0) - /*-- mpiutil.c --*/ mpi_ptr_t mpi_alloc_limb_space(unsigned nlimbs); void mpi_free_limb_space(mpi_ptr_t a); void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs); -/*-- mpi-bit.c --*/ -void mpi_rshift_limbs(MPI a, unsigned int count); -int mpi_lshift_limbs(MPI a, unsigned int count); - -/*-- mpihelp-add.c --*/ static inline mpi_limb_t mpihelp_add_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb); mpi_limb_t mpihelp_add_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, @@ -175,7 +117,6 @@ mpi_limb_t mpihelp_add_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, static inline mpi_limb_t mpihelp_add(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_ptr_t s2_ptr, mpi_size_t s2_size); -/*-- mpihelp-sub.c --*/ static inline mpi_limb_t mpihelp_sub_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb); mpi_limb_t mpihelp_sub_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, @@ -183,10 +124,10 @@ mpi_limb_t mpihelp_sub_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, static inline mpi_limb_t mpihelp_sub(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_ptr_t s2_ptr, mpi_size_t s2_size); -/*-- mpihelp-cmp.c --*/ +/*-- mpih-cmp.c --*/ int mpihelp_cmp(mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size); -/*-- mpihelp-mul.c --*/ +/*-- mpih-mul.c --*/ struct karatsuba_ctx { struct karatsuba_ctx *next; @@ -202,7 +143,6 @@ mpi_limb_t mpihelp_addmul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, 
mpi_limb_t s2_limb); mpi_limb_t mpihelp_submul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb); -int mpihelp_mul_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size); int mpihelp_mul(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, mpi_ptr_t vp, mpi_size_t vsize, mpi_limb_t *_result); void mpih_sqr_n_basecase(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size); @@ -214,21 +154,16 @@ int mpihelp_mul_karatsuba_case(mpi_ptr_t prodp, mpi_ptr_t vp, mpi_size_t vsize, struct karatsuba_ctx *ctx); -/*-- mpihelp-mul_1.c (or xxx/cpu/ *.S) --*/ +/*-- generic_mpih-mul1.c --*/ mpi_limb_t mpihelp_mul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb); -/*-- mpihelp-div.c --*/ -mpi_limb_t mpihelp_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size, - mpi_limb_t divisor_limb); +/*-- mpih-div.c --*/ mpi_limb_t mpihelp_divrem(mpi_ptr_t qp, mpi_size_t qextra_limbs, mpi_ptr_t np, mpi_size_t nsize, mpi_ptr_t dp, mpi_size_t dsize); -mpi_limb_t mpihelp_divmod_1(mpi_ptr_t quot_ptr, - mpi_ptr_t dividend_ptr, mpi_size_t dividend_size, - mpi_limb_t divisor_limb); -/*-- mpihelp-shift.c --*/ +/*-- generic_mpih-[lr]shift.c --*/ mpi_limb_t mpihelp_lshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, unsigned cnt); mpi_limb_t mpihelp_rshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index 6016f1d..9bbd9c5 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -112,18 +112,6 @@ static inline void alloc_global_tags(struct percpu_ida *pool, min(pool->nr_free, pool->percpu_batch_size)); } -static inline unsigned alloc_local_tag(struct percpu_ida_cpu *tags) -{ - int tag = -ENOSPC; - - spin_lock(&tags->lock); - if (tags->nr_free) - tag = tags->freelist[--tags->nr_free]; - spin_unlock(&tags->lock); - - return tag; -} - /** * percpu_ida_alloc - allocate a tag * @pool: pool to allocate from @@ -147,20 +135,22 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) DEFINE_WAIT(wait); 
struct percpu_ida_cpu *tags; unsigned long flags; - int tag; + int tag = -ENOSPC; - local_irq_save(flags); - tags = this_cpu_ptr(pool->tag_cpu); + tags = raw_cpu_ptr(pool->tag_cpu); + spin_lock_irqsave(&tags->lock, flags); /* Fastpath */ - tag = alloc_local_tag(tags); - if (likely(tag >= 0)) { - local_irq_restore(flags); + if (likely(tags->nr_free >= 0)) { + tag = tags->freelist[--tags->nr_free]; + spin_unlock_irqrestore(&tags->lock, flags); return tag; } + spin_unlock_irqrestore(&tags->lock, flags); while (1) { - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); + tags = this_cpu_ptr(pool->tag_cpu); /* * prepare_to_wait() must come before steal_tags(), in case @@ -184,8 +174,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) &pool->cpus_have_tags); } - spin_unlock(&pool->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&pool->lock, flags); if (tag >= 0 || state == TASK_RUNNING) break; @@ -196,9 +185,6 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) } schedule(); - - local_irq_save(flags); - tags = this_cpu_ptr(pool->tag_cpu); } if (state != TASK_RUNNING) finish_wait(&pool->wait, &wait); @@ -222,28 +208,24 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag) BUG_ON(tag >= pool->nr_tags); - local_irq_save(flags); - tags = this_cpu_ptr(pool->tag_cpu); + tags = raw_cpu_ptr(pool->tag_cpu); - spin_lock(&tags->lock); + spin_lock_irqsave(&tags->lock, flags); tags->freelist[tags->nr_free++] = tag; nr_free = tags->nr_free; - spin_unlock(&tags->lock); if (nr_free == 1) { cpumask_set_cpu(smp_processor_id(), &pool->cpus_have_tags); wake_up(&pool->wait); } + spin_unlock_irqrestore(&tags->lock, flags); if (nr_free == pool->percpu_max_size) { - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); + spin_lock(&tags->lock); - /* - * Global lock held and irqs disabled, don't need percpu - * lock - */ if (tags->nr_free == pool->percpu_max_size) { move_tags(pool->freelist, &pool->nr_free, tags->freelist, 
&tags->nr_free, @@ -251,10 +233,9 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag) wake_up(&pool->wait); } - spin_unlock(&pool->lock); + spin_unlock(&tags->lock); + spin_unlock_irqrestore(&pool->lock, flags); } - - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(percpu_ida_free); @@ -346,29 +327,27 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, struct percpu_ida_cpu *remote; unsigned cpu, i, err = 0; - local_irq_save(flags); for_each_possible_cpu(cpu) { remote = per_cpu_ptr(pool->tag_cpu, cpu); - spin_lock(&remote->lock); + spin_lock_irqsave(&remote->lock, flags); for (i = 0; i < remote->nr_free; i++) { err = fn(remote->freelist[i], data); if (err) break; } - spin_unlock(&remote->lock); + spin_unlock_irqrestore(&remote->lock, flags); if (err) goto out; } - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); for (i = 0; i < pool->nr_free; i++) { err = fn(pool->freelist[i], data); if (err) break; } - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); out: - local_irq_restore(flags); return err; } EXPORT_SYMBOL_GPL(percpu_ida_for_each_free); diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c index d7e06b2..0a559a4 100644 --- a/lib/ucs2_string.c +++ b/lib/ucs2_string.c @@ -112,3 +112,5 @@ ucs2_as_utf8(u8 *dest, const ucs2_char_t *src, unsigned long maxlength) return j; } EXPORT_SYMBOL(ucs2_as_utf8); + +MODULE_LICENSE("GPL v2"); @@ -754,3 +754,6 @@ config GUP_BENCHMARK performance of get_user_pages_fast(). 
See tools/testing/selftests/vm/gup_benchmark.c + +config ARCH_HAS_PTE_SPECIAL + bool diff --git a/mm/Makefile b/mm/Makefile index b4e54a9a..8716bda 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -105,3 +105,4 @@ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o +obj-$(CONFIG_MEMFD_CREATE) += memfd.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8fe3ebd..347cc83 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -557,7 +557,7 @@ static int cgwb_create(struct backing_dev_info *bdi, memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); - memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = &blkcg->cgwb_list; /* look up again under lock and discard on blkcg mismatch */ @@ -736,7 +736,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) */ void wb_memcg_offline(struct mem_cgroup *memcg) { - struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); diff --git a/mm/filemap.c b/mm/filemap.c index 0604cb0..52517f2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2489,7 +2489,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. 
*/ -int filemap_fault(struct vm_fault *vmf) +vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; @@ -2499,7 +2499,7 @@ int filemap_fault(struct vm_fault *vmf) pgoff_t offset = vmf->pgoff; pgoff_t max_off; struct page *page; - int ret = 0; + vm_fault_t ret = 0; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) @@ -2693,11 +2693,11 @@ next: } EXPORT_SYMBOL(filemap_map_pages); -int filemap_page_mkwrite(struct vm_fault *vmf) +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); - int ret = VM_FAULT_LOCKED; + vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); @@ -212,53 +212,69 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, unsigned int *page_mask) { - pmd_t *pmd; + pmd_t *pmd, pmdval; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); - if (pmd_none(*pmd)) + /* + * The READ_ONCE() will stabilize the pmdval in a register or + * on the stack so that it will stop changing under the code. 
+ */ + pmdval = READ_ONCE(*pmd); + if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) { page = follow_huge_pmd(mm, address, pmd, flags); if (page) return page; return no_page_table(vma, flags); } - if (is_hugepd(__hugepd(pmd_val(*pmd)))) { + if (is_hugepd(__hugepd(pmd_val(pmdval)))) { page = follow_huge_pd(vma, address, - __hugepd(pmd_val(*pmd)), flags, + __hugepd(pmd_val(pmdval)), flags, PMD_SHIFT); if (page) return page; return no_page_table(vma, flags); } retry: - if (!pmd_present(*pmd)) { + if (!pmd_present(pmdval)) { if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(*pmd)); - if (is_pmd_migration_entry(*pmd)) + !is_pmd_migration_entry(pmdval)); + if (is_pmd_migration_entry(pmdval)) pmd_migration_entry_wait(mm, pmd); + pmdval = READ_ONCE(*pmd); + /* + * MADV_DONTNEED may convert the pmd to null because + * mmap_sem is held in read mode + */ + if (pmd_none(pmdval)) + return no_page_table(vma, flags); goto retry; } - if (pmd_devmap(*pmd)) { + if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); spin_unlock(ptl); if (page) return page; } - if (likely(!pmd_trans_huge(*pmd))) + if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags); - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(pmd_none(*pmd))) { + spin_unlock(ptl); + return no_page_table(vma, flags); + } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); if (likely(!(flags & FOLL_MIGRATION))) @@ -1354,7 +1370,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, 
unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -1430,7 +1446,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, { return 0; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ac5591d..ba8fdc0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -483,11 +483,8 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) static inline struct list_head *page_deferred_list(struct page *page) { - /* - * ->lru in the tail pages is occupied by compound_head. - * Let's use ->mapping + ->index in the second tail page as list_head. - */ - return (struct list_head *)&page[2].mapping; + /* ->lru in the tail pages is occupied by compound_head. */ + return &page[2].deferred_list; } void prep_transhuge_page(struct page *page) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1290887..696beff 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3159,7 +3159,7 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ -static int hugetlb_vm_op_fault(struct vm_fault *vmf) +static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; @@ -3686,6 +3686,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; pte_t new_pte; spinlock_t *ptl; + unsigned long haddr = address & huge_page_mask(h); /* * Currently, we are forced to kill the process in the event the @@ -3716,7 +3717,7 @@ retry: u32 hash; struct vm_fault vmf = { .vma = vma, - .address = address, + .address = haddr, .flags = flags, /* * Hard to debug if it ends up being @@ -3733,14 +3734,14 @@ retry: * fault to make calling code simpler. 
*/ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, - idx, address); + idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); goto out; } - page = alloc_huge_page(vma, address, 0); + page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { ret = PTR_ERR(page); if (ret == -ENOMEM) @@ -3789,12 +3790,12 @@ retry: * the spinlock. */ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - if (vma_needs_reservation(h, vma, address) < 0) { + if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); @@ -3808,17 +3809,17 @@ retry: if (anon_rmap) { ClearPagePrivate(page); - hugepage_add_new_anon_rmap(page, vma, address); + hugepage_add_new_anon_rmap(page, vma, haddr); } else page_dup_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); - set_huge_pte_at(mm, address, ptep, new_pte); + set_huge_pte_at(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); + ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl); } spin_unlock(ptl); @@ -3830,7 +3831,7 @@ backout: spin_unlock(ptl); backout_unlocked: unlock_page(page); - restore_reserve_on_error(h, vma, address, page); + restore_reserve_on_error(h, vma, haddr, page); put_page(page); goto out; } @@ -3883,10 +3884,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; + unsigned long haddr = address & huge_page_mask(h); - address &= huge_page_mask(h); - - ptep = 
huge_pte_offset(mm, address, huge_page_size(h)); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -3896,20 +3896,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); } else { - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; } mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + idx = vma_hugecache_offset(h, vma, haddr); /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -3939,16 +3939,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * consumed. 
*/ if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - if (vma_needs_reservation(h, vma, address) < 0) { + if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, haddr); if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, - vma, address); + vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); @@ -3973,16 +3973,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, + ret = hugetlb_cow(mm, vma, haddr, ptep, pagecache_page, ptl); goto out_put_page; } entry = huge_pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, + if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, address, ptep); + update_mmu_cache(vma, haddr, ptep); out_put_page: if (page != pagecache_page) unlock_page(page); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index eec1150..68c2f2f 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -84,7 +84,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, limit = round_down(PAGE_COUNTER_MAX, 1 << huge_page_order(&hstates[idx])); - ret = page_counter_limit(counter, limit); + ret = page_counter_set_max(counter, limit); VM_BUG_ON(ret); } } @@ -273,7 +273,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -306,7 +306,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: 
mutex_lock(&hugetlb_limit_mutex); - ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); + ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: diff --git a/mm/init-mm.c b/mm/init-mm.c index f94d5d1..f0179c9 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -22,6 +22,7 @@ struct mm_struct init_mm = { .mm_count = ATOMIC_INIT(1), .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) @@ -840,6 +840,17 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } +static inline struct stable_node *page_stable_node(struct page *page) +{ + return PageKsm(page) ? page_rmapping(page) : NULL; +} + +static inline void set_page_stable_node(struct page *page, + struct stable_node *stable_node) +{ + page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); +} + #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: diff --git a/mm/memblock.c b/mm/memblock.c index 5108356..93ad42bc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -68,7 +68,7 @@ ulong __init_memblock choose_memblock_flags(void) /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { - return *size = min(*size, (phys_addr_t)ULLONG_MAX - base); + return *size = min(*size, PHYS_ADDR_MAX - base); } /* @@ -697,6 +697,11 @@ static int __init_memblock memblock_remove_range(struct memblock_type *type, int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { + phys_addr_t end = base + size - 1; + + memblock_dbg("memblock_remove: [%pa-%pa] %pS\n", + &base, &end, (void *)_RET_IP_); + return memblock_remove_range(&memblock.memory, base, size); } @@ -925,7 +930,7 @@ void __init_memblock 
__next_mem_range(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, @@ -1041,7 +1046,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, * break out to advance idx_a @@ -1516,13 +1521,13 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; struct memblock_region *r; /* * translate the memory @limit size into the max address within one of * the memory memblock regions, if the @limit exceeds the total size - * of those regions, max_addr will keep original value ULLONG_MAX + * of those regions, max_addr will keep original value PHYS_ADDR_MAX */ for_each_memblock(memory, r) { if (limit <= r->size) { @@ -1537,7 +1542,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; if (!limit) return; @@ -1545,14 +1550,14 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; /* truncate both memory and reserved regions */ memblock_remove_range(&memblock.memory, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); memblock_remove_range(&memblock.reserved, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); } void __init 
memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) @@ -1580,7 +1585,7 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) /* truncate the reserved regions */ memblock_remove_range(&memblock.reserved, 0, base); memblock_remove_range(&memblock.reserved, - base + size, (phys_addr_t)ULLONG_MAX); + base + size, PHYS_ADDR_MAX); } void __init memblock_mem_limit_remove_map(phys_addr_t limit) @@ -1593,7 +1598,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; memblock_cap_memory_range(0, max_addr); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1695f38..c1e64d6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1034,13 +1034,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) unsigned long limit; count = page_counter_read(&memcg->memory); - limit = READ_ONCE(memcg->memory.limit); + limit = READ_ONCE(memcg->memory.max); if (count < limit) margin = limit - count; if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); - limit = READ_ONCE(memcg->memsw.limit); + limit = READ_ONCE(memcg->memsw.max); if (count <= limit) margin = min(margin, limit - count); else @@ -1148,13 +1148,13 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), - K((u64)memcg->memory.limit), memcg->memory.failcnt); + K((u64)memcg->memory.max), memcg->memory.failcnt); pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memsw)), - K((u64)memcg->memsw.limit), memcg->memsw.failcnt); + K((u64)memcg->memsw.max), memcg->memsw.failcnt); pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->kmem)), - K((u64)memcg->kmem.limit), 
memcg->kmem.failcnt); + K((u64)memcg->kmem.max), memcg->kmem.failcnt); for_each_mem_cgroup_tree(iter, memcg) { pr_info("Memory cgroup stats for "); @@ -1179,21 +1179,21 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) /* * Return the memory (and swap, if configured) limit for a memcg. */ -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { - unsigned long limit; + unsigned long max; - limit = memcg->memory.limit; + max = memcg->memory.max; if (mem_cgroup_swappiness(memcg)) { - unsigned long memsw_limit; - unsigned long swap_limit; + unsigned long memsw_max; + unsigned long swap_max; - memsw_limit = memcg->memsw.limit; - swap_limit = memcg->swap.limit; - swap_limit = min(swap_limit, (unsigned long)total_swap_pages); - limit = min(limit + swap_limit, memsw_limit); + memsw_max = memcg->memsw.max; + swap_max = memcg->swap.max; + swap_max = min(swap_max, (unsigned long)total_swap_pages); + max = min(max + swap_max, memsw_max); } - return limit; + return max; } static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, @@ -2444,12 +2444,13 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -static DEFINE_MUTEX(memcg_limit_mutex); +static DEFINE_MUTEX(memcg_max_mutex); -static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long limit, bool memsw) +static int mem_cgroup_resize_max(struct mem_cgroup *memcg, + unsigned long max, bool memsw) { bool enlarge = false; + bool drained = false; int ret; bool limits_invariant; struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; @@ -2460,26 +2461,32 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; } - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); /* * Make sure that the new limit (memsw or memory limit) doesn't - * break our basic invariant rule memory.limit <= memsw.limit. 
+ * break our basic invariant rule memory.max <= memsw.max. */ - limits_invariant = memsw ? limit >= memcg->memory.limit : - limit <= memcg->memsw.limit; + limits_invariant = memsw ? max >= memcg->memory.max : + max <= memcg->memsw.max; if (!limits_invariant) { - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); ret = -EINVAL; break; } - if (limit > counter->limit) + if (max > counter->max) enlarge = true; - ret = page_counter_limit(counter, limit); - mutex_unlock(&memcg_limit_mutex); + ret = page_counter_set_max(counter, max); + mutex_unlock(&memcg_max_mutex); if (!ret) break; + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw)) { ret = -EBUSY; @@ -2603,6 +2610,9 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); + + drain_all_stock(memcg); + /* try to free all pages in this cgroup */ while (nr_retries && page_counter_read(&memcg->memory)) { int progress; @@ -2757,7 +2767,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -2871,24 +2881,24 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) +static int memcg_update_kmem_max(struct mem_cgroup *memcg, + unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); - ret = page_counter_limit(&memcg->kmem, limit); - mutex_unlock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); + ret = page_counter_set_max(&memcg->kmem, max); + mutex_unlock(&memcg_max_mutex); return ret; } -static int 
memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); - ret = page_counter_limit(&memcg->tcpmem, limit); + ret = page_counter_set_max(&memcg->tcpmem, max); if (ret) goto out; @@ -2913,7 +2923,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) memcg->tcpmem_active = true; } out: - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); return ret; } @@ -2941,16 +2951,16 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } switch (MEMFILE_TYPE(of_cft(of)->private)) { case _MEM: - ret = mem_cgroup_resize_limit(memcg, nr_pages, false); + ret = mem_cgroup_resize_max(memcg, nr_pages, false); break; case _MEMSWAP: - ret = mem_cgroup_resize_limit(memcg, nr_pages, true); + ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - ret = memcg_update_kmem_limit(memcg, nr_pages); + ret = memcg_update_kmem_max(memcg, nr_pages); break; case _TCP: - ret = memcg_update_tcp_limit(memcg, nr_pages); + ret = memcg_update_tcp_max(memcg, nr_pages); break; } break; @@ -3083,7 +3093,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) #endif /* CONFIG_NUMA */ /* Universal VM events cgroup1 shows, original sort order */ -unsigned int memcg1_events[] = { +static const unsigned int memcg1_events[] = { PGPGIN, PGPGOUT, PGFAULT, @@ -3126,8 +3136,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { - memory = min(memory, mi->memory.limit); - memsw = min(memsw, mi->memsw.limit); + memory = min(memory, mi->memory.max); + memsw = min(memsw, mi->memsw.max); } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); @@ -3562,11 +3572,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state 
*css, #ifdef CONFIG_CGROUP_WRITEBACK -struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) -{ - return &memcg->cgwb_list; -} - static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { return wb_domain_init(&memcg->cgwb_domain, gfp); @@ -3626,7 +3631,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { - unsigned long ceiling = min(memcg->memory.limit, memcg->high); + unsigned long ceiling = min(memcg->memory.max, memcg->high); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -4270,7 +4275,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - memcg->low = 0; + page_counter_set_min(&memcg->memory, 0); + page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); wb_memcg_offline(memcg); @@ -4319,12 +4325,13 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); - memcg->low = 0; + page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_min(&memcg->memory, 0); + page_counter_set_low(&memcg->memory, 0); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); @@ -5061,10 +5068,40 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return 
(u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } +static int memory_min_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long min = READ_ONCE(memcg->memory.min); + + if (min == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long min; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &min); + if (err) + return err; + + page_counter_set_min(&memcg->memory, min); + + return nbytes; +} + static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = READ_ONCE(memcg->low); + unsigned long low = READ_ONCE(memcg->memory.low); if (low == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5086,7 +5123,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, if (err) return err; - memcg->low = low; + page_counter_set_low(&memcg->memory, low); return nbytes; } @@ -5131,7 +5168,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->memory.limit); + unsigned long max = READ_ONCE(memcg->memory.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5155,7 +5192,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - xchg(&memcg->memory.limit, max); + xchg(&memcg->memory.max, max); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -5296,6 +5333,12 @@ static struct cftype memory_files[] = { .read_u64 = memory_current_read, }, { + .name = "min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_min_show, + .write = memory_min_write, + }, + 
{ .name = "low", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_low_show, @@ -5344,54 +5387,140 @@ struct cgroup_subsys memory_cgrp_subsys = { }; /** - * mem_cgroup_low - check if memory consumption is below the normal range + * mem_cgroup_protected - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * - * Returns %true if memory consumption of @memcg, and that of all - * ancestors up to (but not including) @root, is below the normal range. + * WARNING: This function is not stateless! It can only be used as part + * of a top-down tree iteration, not for isolated queries. + * + * Returns one of the following: + * MEMCG_PROT_NONE: cgroup memory is not protected + * MEMCG_PROT_LOW: cgroup memory is protected as long there is + * an unprotected supply of reclaimable memory from other cgroups. + * MEMCG_PROT_MIN: cgroup memory is protected * - * @root is exclusive; it is never low when looked at directly and isn't - * checked when traversing the hierarchy. + * @root is exclusive; it is never protected when looked at directly * - * Excluding @root enables using memory.low to prioritize memory usage - * between cgroups within a subtree of the hierarchy that is limited by - * memory.high or memory.max. + * To provide a proper hierarchical behavior, effective memory.min/low values + * are used. Below is the description of how effective memory.low is calculated. + * Effective memory.min values is calculated in the same way. * - * For example, given cgroup A with children B and C: + * Effective memory.low is always equal or less than the original memory.low. + * If there is no memory.low overcommittment (which is always true for + * top-level memory cgroups), these two values are equal. 
+ * Otherwise, it's a part of parent's effective memory.low, + * calculated as a cgroup's memory.low usage divided by sum of sibling's + * memory.low usages, where memory.low usage is the size of actually + * protected memory. * - * A - * / \ - * B C + * low_usage + * elow = min( memory.low, parent->elow * ------------------ ), + * siblings_low_usage * - * and + * | memory.current, if memory.current < memory.low + * low_usage = | + | 0, otherwise. * - * 1. A/memory.current > A/memory.high - * 2. A/B/memory.current < A/B/memory.low - * 3. A/C/memory.current >= A/C/memory.low * - * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we - * should reclaim from 'C' until 'A' is no longer high or until we can - * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by - * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered - * low and we will reclaim indiscriminately from both 'B' and 'C'. + * Such definition of the effective memory.low provides the expected + * hierarchical behavior: parent's memory.low value is limiting + * children, unprotected memory is reclaimed first and cgroups, + * which are not using their guarantee do not affect actual memory + * distribution. + * + * For example, if there are memcgs A, A/B, A/C, A/D and A/E: + * + * A A/memory.low = 2G, A/memory.current = 6G + * //\\ + * BC DE B/memory.low = 3G B/memory.current = 2G + * C/memory.low = 1G C/memory.current = 2G + * D/memory.low = 0 D/memory.current = 2G + * E/memory.low = 10G E/memory.current = 0 + * + * and the memory pressure is applied, the following memory distribution + * is expected (approximately): + * + * A/memory.current = 2G + * + * B/memory.current = 1.3G + * C/memory.current = 0.6G + * D/memory.current = 0 + * E/memory.current = 0 + * + * These calculations require constant tracking of the actual low usages + * (see propagate_protected_usage()), as well as recursive calculation of + * effective memory.low values. 
But as we do call mem_cgroup_protected() + * path for each memory cgroup top-down from the reclaim, + * it's possible to optimize this part, and save calculated elow + * for next usage. This part is intentionally racy, but it's ok, + * as memory.low is a best-effort mechanism. */ -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg) { + struct mem_cgroup *parent; + unsigned long emin, parent_emin; + unsigned long elow, parent_elow; + unsigned long usage; + if (mem_cgroup_disabled()) - return false; + return MEMCG_PROT_NONE; if (!root) root = root_mem_cgroup; if (memcg == root) - return false; + return MEMCG_PROT_NONE; + + usage = page_counter_read(&memcg->memory); + if (!usage) + return MEMCG_PROT_NONE; + + emin = memcg->memory.min; + elow = memcg->memory.low; + + parent = parent_mem_cgroup(memcg); + if (parent == root) + goto exit; - for (; memcg != root; memcg = parent_mem_cgroup(memcg)) { - if (page_counter_read(&memcg->memory) >= memcg->low) - return false; + parent_emin = READ_ONCE(parent->memory.emin); + emin = min(emin, parent_emin); + if (emin && parent_emin) { + unsigned long min_usage, siblings_min_usage; + + min_usage = min(usage, memcg->memory.min); + siblings_min_usage = atomic_long_read( + &parent->memory.children_min_usage); + + if (min_usage && siblings_min_usage) + emin = min(emin, parent_emin * min_usage / + siblings_min_usage); } - return true; + parent_elow = READ_ONCE(parent->memory.elow); + elow = min(elow, parent_elow); + if (elow && parent_elow) { + unsigned long low_usage, siblings_low_usage; + + low_usage = min(usage, memcg->memory.low); + siblings_low_usage = atomic_long_read( + &parent->memory.children_low_usage); + + if (low_usage && siblings_low_usage) + elow = min(elow, parent_elow * low_usage / + siblings_low_usage); + } + +exit: + memcg->memory.emin = emin; + memcg->memory.elow = elow; + + if (usage <= emin) + 
return MEMCG_PROT_MIN; + else if (usage <= elow) + return MEMCG_PROT_LOW; + else + return MEMCG_PROT_NONE; } /** @@ -6012,10 +6141,17 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; + if (!entry.val) { + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); + return 0; + } + memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { + memcg_memory_event(memcg, MEMCG_SWAP_MAX); + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); mem_cgroup_id_put(memcg); return -ENOMEM; } @@ -6067,7 +6203,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, - READ_ONCE(memcg->swap.limit) - + READ_ONCE(memcg->swap.max) - page_counter_read(&memcg->swap)); return nr_swap_pages; } @@ -6088,7 +6224,7 @@ bool mem_cgroup_swap_full(struct page *page) return false; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max) return true; return false; @@ -6122,7 +6258,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, static int swap_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->swap.limit); + unsigned long max = READ_ONCE(memcg->swap.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -6144,15 +6280,23 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, if (err) return err; - mutex_lock(&memcg_limit_mutex); - err = page_counter_limit(&memcg->swap, max); - mutex_unlock(&memcg_limit_mutex); - if (err) - return err; + xchg(&memcg->swap.max, max); return nbytes; } +static int swap_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + 
seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); + seq_printf(m, "fail %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); + + return 0; +} + static struct cftype swap_files[] = { { .name = "swap.current", @@ -6165,6 +6309,12 @@ static struct cftype swap_files[] = { .seq_show = swap_max_show, .write = swap_max_write, }, + { + .name = "swap.events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, swap_events_file), + .seq_show = swap_events_show, + }, { } /* terminate */ }; diff --git a/mm/memfd.c b/mm/memfd.c new file mode 100644 index 0000000..2706951 --- /dev/null +++ b/mm/memfd.c @@ -0,0 +1,345 @@ +/* + * memfd_create system call and file sealing support + * + * Code was originally included in shmem.c, and broken out to facilitate + * use by hugetlbfs as well as tmpfs. + * + * This file is released under the GPL. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/sched/signal.h> +#include <linux/khugepaged.h> +#include <linux/syscalls.h> +#include <linux/hugetlb.h> +#include <linux/shmem_fs.h> +#include <linux/memfd.h> +#include <uapi/linux/memfd.h> + +/* + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * so reuse a tag which we firmly believe is never set or cleared on tmpfs + * or hugetlbfs because they are memory only filesystems. 
+ */ +#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define LAST_SCAN 4 /* about 150ms max */ + +static void memfd_tag_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + + lru_add_drain(); + start = 0; + rcu_read_lock(); + + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + page = radix_tree_deref_slot(slot); + if (!page || radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + } else if (page_count(page) - page_mapcount(page) > 1) { + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_set(&mapping->i_pages, iter.index, + MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); + } + + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * Setting SEAL_WRITE requires us to verify there's no pending writer. However, + * via get_user_pages(), drivers might have some pending I/O without any active + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * and see whether it has an elevated ref-count. If so, we tag them and wait for + * them to be dropped. + * The caller must guarantee that no new user will acquire writable references + * to those pages to avoid races. 
+ */ +static int memfd_wait_for_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + int error, scan; + + memfd_tag_pins(mapping); + + error = 0; + for (scan = 0; scan <= LAST_SCAN; scan++) { + if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) + break; + + if (!scan) + lru_add_drain_all(); + else if (schedule_timeout_killable((HZ << scan) / 200)) + scan = LAST_SCAN; + + start = 0; + rcu_read_lock(); + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, + start, MEMFD_TAG_PINNED) { + + page = radix_tree_deref_slot(slot); + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + page = NULL; + } + + if (page && + page_count(page) - page_mapcount(page) != 1) { + if (scan < LAST_SCAN) + goto continue_resched; + + /* + * On the last scan, we clean up all those tags + * we inserted; but make a note that we still + * found pages pinned. 
+ */ + error = -EBUSY; + } + + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_clear(&mapping->i_pages, + iter.index, MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); +continue_resched: + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); + } + + return error; +} + +static unsigned int *memfd_file_seals_ptr(struct file *file) +{ + if (shmem_file(file)) + return &SHMEM_I(file_inode(file))->seals; + +#ifdef CONFIG_HUGETLBFS + if (is_file_hugepages(file)) + return &HUGETLBFS_I(file_inode(file))->seals; +#endif + + return NULL; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + +static int memfd_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + unsigned int *file_seals; + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a tmpfs or hugetlbfs file + * but restrict access to a specific subset of file operations. Seals + * can only be added, but never removed. This way, mutually untrusted + * parties can share common memory regions with a well-defined policy. + * A malicious peer can thus never perform unwanted operations on a + * shared object. + * + * Seals are only supported on special tmpfs or hugetlbfs files and + * always affect the whole underlying inode. Once a seal is set, it + * may prevent some kinds of access to the file. Currently, the + * following seals are defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. 
Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous tmpfs and hugetlbfs files support sealing. More + * importantly, seals are never written to disk. Therefore, there's + * no plan to support it on other file types. + */ + + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + inode_lock(inode); + + file_seals = memfd_file_seals_ptr(file); + if (!file_seals) { + error = -EINVAL; + goto unlock; + } + + if (*file_seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = memfd_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + *file_seals |= seals; + error = 0; + +unlock: + inode_unlock(inode); + return error; +} + +static int memfd_get_seals(struct file *file) +{ + unsigned int *seals = memfd_file_seals_ptr(file); + + return seals ? 
*seals : -EINVAL; +} + +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = memfd_add_seals(file, arg); + break; + case F_GET_SEALS: + error = memfd_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + unsigned int *file_seals; + struct file *file; + int fd, error; + char *name; + long len; + + if (!(flags & MFD_HUGETLB)) { + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + } else { + /* Allow huge page size encoding in flags. */ + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) + return -EINVAL; + } + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? 
O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + if (flags & MFD_HUGETLB) { + struct user_struct *user = NULL; + + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, + (flags >> MFD_HUGE_SHIFT) & + MFD_HUGE_MASK); + } else + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + + if (flags & MFD_ALLOW_SEALING) { + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; + } + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} diff --git a/mm/memory.c b/mm/memory.c index 5d8c2af..7206a63 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -817,17 +817,12 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * PFNMAP mappings in order to support COWable mappings. * */ -#ifdef __HAVE_ARCH_PTE_SPECIAL -# define HAVE_PTE_SPECIAL 1 -#else -# define HAVE_PTE_SPECIAL 0 -#endif struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); - if (HAVE_PTE_SPECIAL) { + if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) @@ -862,7 +857,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; } - /* !HAVE_PTE_SPECIAL case follows: */ + /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -881,6 +876,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (is_zero_pfn(pfn)) return NULL; + check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); @@ -904,7 +900,7 @@ struct page 
*vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, /* * There is no pmd_special() but there may be special pmds, e.g. * in a direct-access (dax) mapping, so let's just replicate the - * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -1932,7 +1928,8 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && + !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; /* @@ -1954,12 +1951,25 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_mixed); -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +/* + * If the insertion of PTE failed because someone else already added a + * different entry in the mean time, we treat that as success as we assume + * the same entry was actually inserted. + */ + +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn) { - return __vm_insert_mixed(vma, addr, pfn, true); + int err; + + err = __vm_insert_mixed(vma, addr, pfn, true); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL(vm_insert_mixed_mkwrite); +EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); /* * maps a range of physical memory into the requested pages. 
the old diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2598246..7deb49f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1237,6 +1237,29 @@ static struct page *next_active_pageblock(struct page *page) return page + pageblock_nr_pages; } +static bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone; + unsigned long pfn; + + /* + * We have to be careful here because we are iterating over memory + * sections which are not zone aware so we might end up outside of + * the zone but still within the section. + * We have to take care about the node as well. If the node is offline + * its NODE_DATA will be NULL - see page_zone. + */ + if (!node_online(page_to_nid(page))) + return false; + + zone = page_zone(page); + pfn = page_to_pfn(page); + if (!zone_spans_pfn(zone, pfn)) + return false; + + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); +} + /* Checks if this range of memory is likely to be hot-removable. */ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { @@ -3277,7 +3277,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) mm->data_vm += npages; } -static int special_mapping_fault(struct vm_fault *vmf); +static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. 
@@ -3316,7 +3316,7 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = { .fault = special_mapping_fault, }; -static int special_mapping_fault(struct vm_fault *vmf) +static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; @@ -1763,7 +1763,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -int filemap_fault(struct vm_fault *vmf) +vm_fault_t filemap_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8ba6cb8..6694348 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) int nid; if (is_memcg_oom(oc)) { - oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1; + oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; return CONSTRAINT_MEMCG; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22320ea27..07b3c23 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -705,16 +705,14 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy - * we can do coalesce a page and its buddy if + * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we set ->_mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. - * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is - * serialized by zone->lock. + * For recording whether a page is in the buddy system, we set PageBuddy. + * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). 
*/ @@ -759,9 +757,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with _mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) - * field. + * free pages of length of (1 << order) and marked with PageBuddy. + * Page's order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. @@ -946,7 +943,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } switch (page - head_page) { case 1: - /* the first tail page: ->mapping is compound_mapcount() */ + /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { bad_page(page, "nonzero compound_mapcount", 0); goto out; @@ -955,7 +952,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) case 2: /* * the second tail page: ->mapping is - * page_deferred_list().next -- ignore value. + * deferred_list.next -- ignore value. 
*/ break; default: @@ -3701,7 +3698,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla #endif /* CONFIG_COMPACTION */ #ifdef CONFIG_LOCKDEP -struct lockdep_map __fs_reclaim_map = +static struct lockdep_map __fs_reclaim_map = STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); static bool __need_fs_reclaim(gfp_t gfp_mask) @@ -3726,17 +3723,27 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) return true; } +void __fs_reclaim_acquire(void) +{ + lock_map_acquire(&__fs_reclaim_map); +} + +void __fs_reclaim_release(void) +{ + lock_map_release(&__fs_reclaim_map); +} + void fs_reclaim_acquire(gfp_t gfp_mask) { if (__need_fs_reclaim(gfp_mask)) - lock_map_acquire(&__fs_reclaim_map); + __fs_reclaim_acquire(); } EXPORT_SYMBOL_GPL(fs_reclaim_acquire); void fs_reclaim_release(gfp_t gfp_mask) { if (__need_fs_reclaim(gfp_mask)) - lock_map_release(&__fs_reclaim_map); + __fs_reclaim_release(); } EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif @@ -3754,8 +3761,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - noreclaim_flag = memalloc_noreclaim_save(); fs_reclaim_acquire(gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3763,8 +3770,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, ac->nodemask); current->reclaim_state = NULL; - fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(gfp_mask); cond_resched(); @@ -4162,7 +4169,6 @@ retry: * orientated. 
*/ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { - ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); } @@ -4326,8 +4332,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* Determine whether to spread dirty pages and what the first usable zone */ -static inline void finalise_ac(gfp_t gfp_mask, - unsigned int order, struct alloc_context *ac) +static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) { /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4358,7 +4363,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; - finalise_ac(gfp_mask, order, &ac); + finalise_ac(gfp_mask, &ac); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); @@ -6229,18 +6234,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize, freesize, memmap_pages; + unsigned long size, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; - realsize = freesize = zone->present_pages; + freesize = zone->present_pages; /* * Adjust freesize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ - memmap_pages = calc_memmap_size(size, realsize); + memmap_pages = calc_memmap_size(size, freesize); if (!is_highmem_idx(j)) { if (freesize >= memmap_pages) { freesize -= memmap_pages; @@ -6272,7 +6277,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) * when the bootmem allocator frees pages into the buddy system. 
* And all highmem pages will be managed by the buddy system. */ - zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; + zone->managed_pages = freesize; #ifdef CONFIG_NUMA zone->node = nid; #endif @@ -7682,29 +7687,6 @@ unmovable: return true; } -bool is_pageblock_removable_nolock(struct page *page) -{ - struct zone *zone; - unsigned long pfn; - - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. If the node is offline - * its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) - return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) - return false; - - return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); -} - #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static unsigned long pfn_max_align_down(unsigned long pfn) diff --git a/mm/page_counter.c b/mm/page_counter.c index 2a8df3a..de31470 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,6 +13,40 @@ #include <linux/bug.h> #include <asm/page.h> +static void propagate_protected_usage(struct page_counter *c, + unsigned long usage) +{ + unsigned long protected, old_protected; + long delta; + + if (!c->parent) + return; + + if (c->min || atomic_long_read(&c->min_usage)) { + if (usage <= c->min) + protected = usage; + else + protected = 0; + + old_protected = atomic_long_xchg(&c->min_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_min_usage); + } + + if (c->low || atomic_long_read(&c->low_usage)) { + if (usage <= c->low) + protected = usage; + else + protected = 0; + + old_protected = atomic_long_xchg(&c->low_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, 
&c->parent->children_low_usage); + } +} + /** * page_counter_cancel - take pages out of the local counter * @counter: counter @@ -22,7 +56,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; - new = atomic_long_sub_return(nr_pages, &counter->count); + new = atomic_long_sub_return(nr_pages, &counter->usage); + propagate_protected_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -41,7 +76,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) for (c = counter; c; c = c->parent) { long new; - new = atomic_long_add_return(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); + propagate_protected_usage(c, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -82,9 +118,10 @@ bool page_counter_try_charge(struct page_counter *counter, * we either see the new limit or the setter sees the * counter has changed and retries. */ - new = atomic_long_add_return(nr_pages, &c->count); - if (new > c->limit) { - atomic_long_sub(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); + if (new > c->max) { + atomic_long_sub(nr_pages, &c->usage); + propagate_protected_usage(c, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -93,6 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } + propagate_protected_usage(c, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. 
@@ -123,20 +161,20 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) } /** - * page_counter_limit - limit the number of pages allowed + * page_counter_set_max - set the maximum number of pages allowed * @counter: counter - * @limit: limit to set + * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ -int page_counter_limit(struct page_counter *counter, unsigned long limit) +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; - long count; + long usage; /* * Update the limit while making sure that it's not @@ -149,22 +187,56 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) * the limit, so if it sees the old limit, we see the * modified counter and retry. */ - count = atomic_long_read(&counter->count); + usage = atomic_long_read(&counter->usage); - if (count > limit) + if (usage > nr_pages) return -EBUSY; - old = xchg(&counter->limit, limit); + old = xchg(&counter->max, nr_pages); - if (atomic_long_read(&counter->count) <= count) + if (atomic_long_read(&counter->usage) <= usage) return 0; - counter->limit = old; + counter->max = old; cond_resched(); } } /** + * page_counter_set_min - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. + */ +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->min = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_protected_usage(c, atomic_long_read(&c->usage)); +} + +/** + * page_counter_set_low - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. 
+ */ +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->low = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_protected_usage(c, atomic_long_read(&c->usage)); +} + +/** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse * @max: string meaning maximum possible value @@ -327,7 +327,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { struct radix_tree_node *node; - void **pslot; + void __rcu **pslot; void *item; VM_BUG_ON(!expected); @@ -395,7 +395,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE /* ifdef here to avoid bloating shmem.o when not necessary */ -int shmem_huge __read_mostly; +static int shmem_huge __read_mostly; #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) @@ -571,6 +571,15 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) +{ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && + shmem_huge != SHMEM_HUGE_DENY) + return true; + return false; +} + /* * Like add_to_page_cache_locked, but error if expected item has gone. 
*/ @@ -682,7 +691,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; struct page *page; unsigned long swapped = 0; @@ -988,6 +997,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { spin_lock_irq(&info->lock); @@ -995,6 +1005,10 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, spin_unlock_irq(&info->lock); } generic_fillattr(inode, stat); + + if (is_huge_enabled(sb_info)) + stat->blksize = HPAGE_PMD_SIZE; + return 0; } @@ -1098,13 +1112,19 @@ static void shmem_evict_inode(struct inode *inode) static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; unsigned long found = -1; unsigned int checked = 0; rcu_read_lock(); radix_tree_for_each_slot(slot, root, &iter, 0) { - if (*slot == item) { + void *entry = radix_tree_deref_slot(slot); + + if (radix_tree_deref_retry(entry)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + if (entry == item) { found = iter.index; break; } @@ -1322,9 +1342,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!swap.val) goto redirty; - if (mem_cgroup_try_charge_swap(page, swap)) - goto free_swap; - /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. 
Do it now before the page is @@ -1353,7 +1370,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); -free_swap: put_swap_page(page, swap); redirty: set_page_dirty(page); @@ -1404,10 +1420,9 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma, struct shmem_inode_info *info, pgoff_t index) { /* Create a pseudo vma that just contains the policy */ - vma->vm_start = 0; + memset(vma, 0, sizeof(*vma)); /* Bias interleave by inode number to distribute better across nodes */ vma->vm_pgoff = index + info->vfs_inode.i_ino; - vma->vm_ops = NULL; vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); } @@ -1931,14 +1946,14 @@ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, in return ret; } -static int shmem_fault(struct vm_fault *vmf) +static vm_fault_t shmem_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); enum sgp_type sgp; - int error; - int ret = VM_FAULT_LOCKED; + int err; + vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can @@ -2006,10 +2021,10 @@ static int shmem_fault(struct vm_fault *vmf) else if (vma->vm_flags & VM_HUGEPAGE) sgp = SGP_HUGE; - error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, + err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); - if (error) - return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); + if (err) + return vmf_error(err); return ret; } @@ -2616,241 +2631,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) return offset; } -/* - * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, - * so reuse a tag which we firmly believe is never set or cleared on shmem. 
- */ -#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE -#define LAST_SCAN 4 /* about 150ms max */ - -static void shmem_tag_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void **slot; - pgoff_t start; - struct page *page; - - lru_add_drain(); - start = 0; - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - page = radix_tree_deref_slot(slot); - if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - } else if (page_count(page) - page_mapcount(page) > 1) { - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_set(&mapping->i_pages, iter.index, - SHMEM_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); - } - - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); -} - -/* - * Setting SEAL_WRITE requires us to verify there's no pending writer. However, - * via get_user_pages(), drivers might have some pending I/O without any active - * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages - * and see whether it has an elevated ref-count. If so, we tag them and wait for - * them to be dropped. - * The caller must guarantee that no new user will acquire writable references - * to those pages to avoid races. 
- */ -static int shmem_wait_for_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void **slot; - pgoff_t start; - struct page *page; - int error, scan; - - shmem_tag_pins(mapping); - - error = 0; - for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED)) - break; - - if (!scan) - lru_add_drain_all(); - else if (schedule_timeout_killable((HZ << scan) / 200)) - scan = LAST_SCAN; - - start = 0; - rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, SHMEM_TAG_PINNED) { - - page = radix_tree_deref_slot(slot); - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - page = NULL; - } - - if (page && - page_count(page) - page_mapcount(page) != 1) { - if (scan < LAST_SCAN) - goto continue_resched; - - /* - * On the last scan, we clean up all those tags - * we inserted; but make a note that we still - * found pages pinned. 
- */ - error = -EBUSY; - } - - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_clear(&mapping->i_pages, - iter.index, SHMEM_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); -continue_resched: - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); - } - - return error; -} - -static unsigned int *memfd_file_seals_ptr(struct file *file) -{ - if (file->f_op == &shmem_file_operations) - return &SHMEM_I(file_inode(file))->seals; - -#ifdef CONFIG_HUGETLBFS - if (file->f_op == &hugetlbfs_file_operations) - return &HUGETLBFS_I(file_inode(file))->seals; -#endif - - return NULL; -} - -#define F_ALL_SEALS (F_SEAL_SEAL | \ - F_SEAL_SHRINK | \ - F_SEAL_GROW | \ - F_SEAL_WRITE) - -static int memfd_add_seals(struct file *file, unsigned int seals) -{ - struct inode *inode = file_inode(file); - unsigned int *file_seals; - int error; - - /* - * SEALING - * Sealing allows multiple parties to share a shmem-file but restrict - * access to a specific subset of file operations. Seals can only be - * added, but never removed. This way, mutually untrusted parties can - * share common memory regions with a well-defined policy. A malicious - * peer can thus never perform unwanted operations on a shared object. - * - * Seals are only supported on special shmem-files and always affect - * the whole underlying inode. Once a seal is set, it may prevent some - * kinds of access to the file. Currently, the following seals are - * defined: - * SEAL_SEAL: Prevent further seals from being set on this file - * SEAL_SHRINK: Prevent the file from shrinking - * SEAL_GROW: Prevent the file from growing - * SEAL_WRITE: Prevent write access to the file - * - * As we don't require any trust relationship between two parties, we - * must prevent seals from being removed. Therefore, sealing a file - * only adds a given set of seals to the file, it never touches - * existing seals. 
Furthermore, the "setting seals"-operation can be - * sealed itself, which basically prevents any further seal from being - * added. - * - * Semantics of sealing are only defined on volatile files. Only - * anonymous shmem files support sealing. More importantly, seals are - * never written to disk. Therefore, there's no plan to support it on - * other file types. - */ - - if (!(file->f_mode & FMODE_WRITE)) - return -EPERM; - if (seals & ~(unsigned int)F_ALL_SEALS) - return -EINVAL; - - inode_lock(inode); - - file_seals = memfd_file_seals_ptr(file); - if (!file_seals) { - error = -EINVAL; - goto unlock; - } - - if (*file_seals & F_SEAL_SEAL) { - error = -EPERM; - goto unlock; - } - - if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { - error = mapping_deny_writable(file->f_mapping); - if (error) - goto unlock; - - error = shmem_wait_for_pins(file->f_mapping); - if (error) { - mapping_allow_writable(file->f_mapping); - goto unlock; - } - } - - *file_seals |= seals; - error = 0; - -unlock: - inode_unlock(inode); - return error; -} - -static int memfd_get_seals(struct file *file) -{ - unsigned int *seals = memfd_file_seals_ptr(file); - - return seals ? 
*seals : -EINVAL; -} - -long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long error; - - switch (cmd) { - case F_ADD_SEALS: - /* disallow upper 32bit */ - if (arg > UINT_MAX) - return -EINVAL; - - error = memfd_add_seals(file, arg); - break; - case F_GET_SEALS: - error = memfd_get_seals(file); - break; - default: - error = -EINVAL; - break; - } - - return error; -} - static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -3428,6 +3208,15 @@ static int shmem_match(struct inode *ino, void *vfh) return ino->i_ino == inum && fh[0] == ino->i_generation; } +/* Find any alias of inode, but prefer a hashed alias */ +static struct dentry *shmem_find_alias(struct inode *inode) +{ + struct dentry *alias = d_find_alias(inode); + + return alias ?: d_find_any_alias(inode); +} + + static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { @@ -3444,7 +3233,7 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { - dentry = d_find_alias(inode); + dentry = shmem_find_alias(inode); iput(inode); } @@ -3673,93 +3462,6 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) return 0; } -#define MFD_NAME_PREFIX "memfd:" -#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) -#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) - -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) - -SYSCALL_DEFINE2(memfd_create, - const char __user *, uname, - unsigned int, flags) -{ - unsigned int *file_seals; - struct file *file; - int fd, error; - char *name; - long len; - - if (!(flags & MFD_HUGETLB)) { - if (flags & ~(unsigned int)MFD_ALL_FLAGS) - return -EINVAL; - } else { - /* Allow huge page size encoding in flags. 
*/ - if (flags & ~(unsigned int)(MFD_ALL_FLAGS | - (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) - return -EINVAL; - } - - /* length includes terminating zero */ - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); - if (len <= 0) - return -EFAULT; - if (len > MFD_NAME_MAX_LEN + 1) - return -EINVAL; - - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); - if (!name) - return -ENOMEM; - - strcpy(name, MFD_NAME_PREFIX); - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { - error = -EFAULT; - goto err_name; - } - - /* terminating-zero may have changed after strnlen_user() returned */ - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { - error = -EFAULT; - goto err_name; - } - - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_name; - } - - if (flags & MFD_HUGETLB) { - struct user_struct *user = NULL; - - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, - HUGETLB_ANONHUGE_INODE, - (flags >> MFD_HUGE_SHIFT) & - MFD_HUGE_MASK); - } else - file = shmem_file_setup(name, 0, VM_NORESERVE); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_fd; - } - file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - file->f_flags |= O_RDWR | O_LARGEFILE; - - if (flags & MFD_ALLOW_SEALING) { - file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - } - - fd_install(fd, file); - kfree(name); - return fd; - -err_fd: - put_unused_fd(fd); -err_name: - kfree(name); - return error; -} - #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) @@ -1235,8 +1235,6 @@ void __init kmem_cache_init(void) { int i; - BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < - sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) @@ -2665,6 +2663,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, invalid_mask, &invalid_mask, flags, &flags); dump_stack(); } + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); local_flags = 
flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); check_irq_off(); @@ -3071,6 +3070,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); if (!objp) return objp; if (cachep->flags & SLAB_POISON) { @@ -555,8 +555,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags, node); } - if (b && c->ctor) + if (b && c->ctor) { + WARN_ON_ONCE(flags & __GFP_ZERO); c->ctor(b); + } kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; @@ -52,11 +52,11 @@ * and to synchronize major metadata changes to slab cache structures. * * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects the second - * double word in the page struct. Meaning + * have the ability to do a cmpxchg_double. It only protects: * A. page->freelist -> List of object free in a page - * B. page->counters -> Counters of objects - * C. page->frozen -> frozen state + * B. page->inuse -> Number of objects in use + * C. page->objects -> Number of objects in page + * D. page->frozen -> frozen state * * If a slab is frozen then it is exempt from list management. It is not * on any list. 
The processor that froze the slab is the one who can @@ -316,16 +316,16 @@ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } -static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved) +static inline unsigned int order_objects(unsigned int order, unsigned int size) { - return (((unsigned int)PAGE_SIZE << order) - reserved) / size; + return ((unsigned int)PAGE_SIZE << order) / size; } static inline struct kmem_cache_order_objects oo_make(unsigned int order, - unsigned int size, unsigned int reserved) + unsigned int size) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + order_objects(order, size, reserved) + (order << OO_SHIFT) + order_objects(order, size) }; return x; @@ -356,21 +356,6 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } -static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) -{ - struct page tmp; - tmp.counters = counters_new; - /* - * page->counters can cover frozen/inuse/objects as well - * as page->_refcount. If we assign to ->counters directly - * we run the risk of losing updates to page->_refcount, so - * be careful and only assign to the fields we need. 
- */ - page->frozen = tmp.frozen; - page->inuse = tmp.inuse; - page->objects = tmp.objects; -} - /* Interrupts must be disabled (for the fallback code to work right) */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, @@ -392,7 +377,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); return true; } @@ -431,7 +416,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); local_irq_restore(flags); return true; @@ -711,7 +696,7 @@ void object_err(struct kmem_cache *s, struct page *page, print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, +static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) 
{ va_list args; @@ -847,7 +832,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)) - s->reserved; + length = PAGE_SIZE << compound_order(page); end = start + length; remainder = length % s->size; if (!remainder) @@ -936,7 +921,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = order_objects(compound_order(page), s->size, s->reserved); + maxobj = order_objects(compound_order(page), s->size); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", page->objects, maxobj); @@ -986,7 +971,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = order_objects(compound_order(page), s->size, s->reserved); + max_objects = order_objects(compound_order(page), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1694,24 +1679,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - page_mapcount_reset(page); + page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; memcg_uncharge_slab(page, order, s); __free_pages(page, order); } -#define need_reserve_slab_rcu \ - (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) - static void rcu_free_slab(struct rcu_head *h) { - struct page *page; - - if (need_reserve_slab_rcu) - page = virt_to_head_page(h); - else - page = container_of((struct list_head *)h, struct page, lru); + struct page *page = container_of(h, struct page, rcu_head); __free_slab(page->slab_cache, page); } @@ -1719,19 +1696,7 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { - struct rcu_head *head; - - if (need_reserve_slab_rcu) { - int order = compound_order(page); - int offset = (PAGE_SIZE << 
order) - s->reserved; - - VM_BUG_ON(s->reserved != sizeof(*head)); - head = page_address(page) + offset; - } else { - head = &page->rcu_head; - } - - call_rcu(head, rcu_free_slab); + call_rcu(&page->rcu_head, rcu_free_slab); } else __free_slab(s, page); } @@ -2444,6 +2409,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, struct kmem_cache_cpu *c = *pc; struct page *page; + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + freelist = get_partial(s, flags, node, c); if (freelist) @@ -3226,21 +3193,21 @@ static unsigned int slub_min_objects; */ static inline unsigned int slab_order(unsigned int size, unsigned int min_objects, unsigned int max_order, - unsigned int fract_leftover, unsigned int reserved) + unsigned int fract_leftover) { unsigned int min_order = slub_min_order; unsigned int order; - if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved)); + for (order = max(min_order, (unsigned int)get_order(min_objects * size)); order <= max_order; order++) { unsigned int slab_size = (unsigned int)PAGE_SIZE << order; unsigned int rem; - rem = (slab_size - reserved) % size; + rem = slab_size % size; if (rem <= slab_size / fract_leftover) break; @@ -3249,7 +3216,7 @@ static inline unsigned int slab_order(unsigned int size, return order; } -static inline int calculate_order(unsigned int size, unsigned int reserved) +static inline int calculate_order(unsigned int size) { unsigned int order; unsigned int min_objects; @@ -3266,7 +3233,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = order_objects(slub_max_order, size, reserved); + max_objects = order_objects(slub_max_order, size); min_objects = min(min_objects, 
max_objects); while (min_objects > 1) { @@ -3275,7 +3242,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction, reserved); + slub_max_order, fraction); if (order <= slub_max_order) return order; fraction /= 2; @@ -3287,14 +3254,14 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1, reserved); + order = slab_order(size, 1, slub_max_order, 1); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1, reserved); + order = slab_order(size, 1, MAX_ORDER, 1); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -3562,7 +3529,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size, s->reserved); + order = calculate_order(size); if ((int)order < 0) return 0; @@ -3580,8 +3547,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size, s->reserved); - s->min = oo_make(get_order(size), size, s->reserved); + s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -3591,14 +3558,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); - s->reserved = 0; #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif - if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) - s->reserved = sizeof(struct rcu_head); - if (!calculate_sizes(s, -1)) 
goto error; if (disable_higher_order_debug) { @@ -4239,12 +4202,6 @@ void __init kmem_cache_init(void) SLAB_HWCACHE_ALIGN, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); - - /* - * Allocate kmem_cache_node properly from the kmem_cache slab. - * kmem_cache_node is separately allocated so no need to - * update any list pointers. - */ kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ @@ -5117,12 +5074,6 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); -static ssize_t reserved_show(struct kmem_cache *s, char *buf) -{ - return sprintf(buf, "%u\n", s->reserved); -} -SLAB_ATTR_RO(reserved); - #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -5435,7 +5386,6 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, - &reserved_attr.attr, &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, diff --git a/mm/sparse.c b/mm/sparse.c index 73dc2fc..f13f272 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -190,15 +190,13 @@ static inline int next_present_section_nr(int section_nr) section_nr++; if (present_section_nr(section_nr)) return section_nr; - } while ((section_nr < NR_MEM_SECTIONS) && - (section_nr <= __highest_present_section_nr)); + } while ((section_nr <= __highest_present_section_nr)); return -1; } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ ((section_nr >= 0) && \ - (section_nr < NR_MEM_SECTIONS) && \ (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) @@ -524,7 +522,7 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) map_count = 1; } /* ok, last chunk */ - alloc_func(data, pnum_begin, NR_MEM_SECTIONS, + alloc_func(data, pnum_begin, __highest_present_section_nr+1, map_count, nodeid_begin); } diff 
--git a/mm/swap_slots.c b/mm/swap_slots.c index f264189..f51ac05 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -317,7 +317,7 @@ swp_entry_t get_swap_page(struct page *page) if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) get_swap_pages(1, true, &entry); - return entry; + goto out; } /* @@ -347,10 +347,14 @@ repeat: } mutex_unlock(&cache->alloc_lock); if (entry.val) - return entry; + goto out; } get_swap_pages(1, false, &entry); - +out: + if (mem_cgroup_try_charge_swap(page, entry)) { + put_swap_page(page, entry); + entry.val = 0; + } return entry; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 07f9aa2..ab8e59c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -216,9 +216,6 @@ int add_to_swap(struct page *page) if (!entry.val) return 0; - if (mem_cgroup_try_charge_swap(page, entry)) - goto fail; - /* * Radix-tree node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 39791b8..5029f24 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage) + bool zeropage, + bool *mmap_changing) { struct vm_area_struct *dst_vma; ssize_t err; @@ -431,6 +432,15 @@ retry: down_read(&dst_mm->mmap_sem); /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + err = -EAGAIN; + if (mmap_changing && READ_ONCE(*mmap_changing)) + goto out_unlock; + + /* * Make sure the vma is not shared, that the dst range is * both valid and fully within a single existing vma. 
*/ @@ -563,13 +573,15 @@ out: } ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len) + unsigned long src_start, unsigned long len, + bool *mmap_changing) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, + mmap_changing); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len) + unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true); + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); } @@ -391,7 +391,8 @@ EXPORT_SYMBOL(vm_mmap); * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * - * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. + * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not + * fall back to vmalloc. */ void *kvmalloc_node(size_t size, gfp_t flags, int node) { @@ -402,7 +403,8 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) * so the given set of flags has to be compatible. */ - WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); + if ((flags & GFP_KERNEL) != GFP_KERNEL) + return kmalloc_node(size, flags, node); /* * We want to attempt a large physically contiguous block first because diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 63a5f50..89efac3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -603,26 +603,6 @@ static void unmap_vmap_area(struct vmap_area *va) vunmap_page_range(va->va_start, va->va_end); } -static void vmap_debug_free_range(unsigned long start, unsigned long end) -{ - /* - * Unmap page tables and force a TLB flush immediately if pagealloc - * debugging is enabled. 
This catches use after free bugs similarly to - * those in linear kernel virtual address space after a page has been - * freed. - * - * All the lazy freeing logic is still retained, in order to minimise - * intrusiveness of this debugging feature. - * - * This is going to be *slow* (linear kernel virtual address debugging - * doesn't do a broadcast TLB flush so it is a lot faster). - */ - if (debug_pagealloc_enabled()) { - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); - } -} - /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. @@ -756,6 +736,9 @@ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); unmap_vmap_area(va); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range(va->va_start, va->va_end); + free_vmap_area_noflush(va); } @@ -1053,6 +1036,10 @@ static void vb_free(const void *addr, unsigned long size) vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range((unsigned long)addr, + (unsigned long)addr + size); + spin_lock(&vb->lock); /* Expand dirty range */ @@ -1141,16 +1128,16 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); - debug_check_no_locks_freed(mem, size); - vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) { + debug_check_no_locks_freed(mem, size); vb_free(mem, size); return; } va = find_vmap_area(addr); BUG_ON(!va); + debug_check_no_locks_freed((void *)va->va_start, + (va->va_end - va->va_start)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1499,7 +1486,6 @@ struct vm_struct *remove_vm_area(const void *addr) va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); - vmap_debug_free_range(va->va_start, va->va_end); kasan_free_shadow(vm); free_unmap_vmap_area(va); @@ -1519,16 +1505,17 @@ static void __vunmap(const void 
*addr, int deallocate_pages) addr)) return; - area = remove_vm_area(addr); + area = find_vmap_area((unsigned long)addr)->vm; if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } - debug_check_no_locks_freed(addr, get_vm_area_size(area)); - debug_check_no_obj_freed(addr, get_vm_area_size(area)); + debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); + debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); + remove_vm_area(addr); if (deallocate_pages) { int i; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 85350ce..4854584 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -342,26 +342,6 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) vmpressure(gfp, memcg, true, vmpressure_win, 0); } -static enum vmpressure_levels str_to_level(const char *arg) -{ - enum vmpressure_levels level; - - for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) - if (!strcmp(vmpressure_str_levels[level], arg)) - return level; - return -1; -} - -static enum vmpressure_modes str_to_mode(const char *arg) -{ - enum vmpressure_modes mode; - - for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++) - if (!strcmp(vmpressure_str_modes[mode], arg)) - return mode; - return -1; -} - #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) /** @@ -390,27 +370,26 @@ int vmpressure_register_event(struct mem_cgroup *memcg, char *token; int ret = 0; - spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL); + spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); if (!spec) { ret = -ENOMEM; goto out; } - strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN); /* Find required level */ token = strsep(&spec, ","); - level = str_to_level(token); - if (level == -1) { - ret = -EINVAL; + level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); + if (level < 0) { + ret = level; goto out; } /* Find optional mode */ token = strsep(&spec, ","); if (token) { - 
mode = str_to_mode(token); - if (mode == -1) { - ret = -EINVAL; + mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); + if (mode < 0) { + ret = mode; goto out; } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9270a43..03822f8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2544,12 +2544,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long reclaimed; unsigned long scanned; - if (mem_cgroup_low(root, memcg)) { + switch (mem_cgroup_protected(root, memcg)) { + case MEMCG_PROT_MIN: + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + case MEMCG_PROT_LOW: + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); + break; + case MEMCG_PROT_NONE: + break; } reclaimed = sc->nr_reclaimed; @@ -3318,11 +3334,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_unmap = 1, .may_swap = 1, }; + + __fs_reclaim_acquire(); + count_vm_event(PAGEOUTRUN); do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; + bool ret; sc.reclaim_idx = classzone_idx; @@ -3395,7 +3415,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ - if (try_to_freeze() || kthread_should_stop()) + __fs_reclaim_release(); + ret = try_to_freeze(); + __fs_reclaim_acquire(); + if (ret || kthread_should_stop()) break; /* @@ -3412,6 +3435,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) out: snapshot_refaults(NULL, pgdat); + __fs_reclaim_release(); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. 
If another caller @@ -3600,9 +3624,7 @@ kswapd_try_sleep: */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, alloc_order); - fs_reclaim_acquire(GFP_KERNEL); reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); - fs_reclaim_release(GFP_KERNEL); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } @@ -3684,16 +3706,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) unsigned long nr_reclaimed; unsigned int noreclaim_flag; - noreclaim_flag = memalloc_noreclaim_save(); fs_reclaim_acquire(sc.gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; - fs_reclaim_release(sc.gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); return nr_reclaimed; } @@ -3870,6 +3892,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in }; cond_resched(); + fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE @@ -3877,7 +3900,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3892,9 +3914,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; - fs_reclaim_release(gfp_mask); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); return sc.nr_reclaimed >= nr_pages; } diff --git a/net/9p/client.c b/net/9p/client.c index 21e6df1..18c5271 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -198,8 +198,6 @@ static int parse_opts(char *opts, struct p9_client *clnt) pr_info("Could not find request 
transport: %s\n", s); ret = -EINVAL; - kfree(s); - goto free_and_return; } kfree(s); break; @@ -214,13 +212,12 @@ static int parse_opts(char *opts, struct p9_client *clnt) "problem allocating copy of version arg\n"); goto free_and_return; } - ret = get_protocol_version(s); - if (ret == -EINVAL) { - kfree(s); - goto free_and_return; - } + r = get_protocol_version(s); + if (r < 0) + ret = r; + else + clnt->proto_version = r; kfree(s); - clnt->proto_version = ret; break; default: continue; diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 0f19960..2e2b8bc 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -38,7 +38,6 @@ #include <linux/module.h> #include <linux/spinlock.h> -#include <linux/rwlock.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <net/9p/transport.h> diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e6033d3..e3b7362 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1,9 +1,11 @@ #!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 +# # (c) 2001, Dave Jones. (the file handling bit) # (c) 2005, Joel Schopp <jschopp@austin.ibm.com> (the ugly bit) # (c) 2007,2008, Andy Whitcroft <apw@uk.ibm.com> (new conditions, test suite) # (c) 2008-2010 Andy Whitcroft <apw@canonical.com> -# Licensed under the terms of the GNU GPL License version 2 +# (c) 2010-2018 Joe Perches <joe@perches.com> use strict; use warnings; @@ -2375,6 +2377,14 @@ sub process { my $rawline = $rawlines[$linenr - 1]; +# check if it's a mode change, rename or start of a patch + if (!$in_commit_log && + ($line =~ /^ mode change [0-7]+ => [0-7]+ \S+\s*$/ || + ($line =~ /^rename (?:from|to) \S+\s*$/ || + $line =~ /^diff --git a\/[\w\/\.\_\-]+ b\/\S+\s*$/))) { + $is_patch = 1; + } + #extract the line range in the file after the patch is applied if (!$in_commit_log && $line =~ /^\@\@ -\d+(?:,\d+)? \+(\d+)(,(\d+))? 
\@\@(.*)/) { diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 99c96e8..c87fa73 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -1,4 +1,6 @@ #!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 +# # (c) 2007, Joe Perches <joe@perches.com> # created from checkpatch.pl # @@ -7,8 +9,6 @@ # # usage: perl scripts/get_maintainer.pl [OPTIONS] <patch> # perl scripts/get_maintainer.pl [OPTIONS] -f <file> -# -# Licensed under the terms of the GNU GPL License version 2 use warnings; use strict; @@ -542,7 +542,18 @@ foreach my $file (@ARGV) { while (<$patch>) { my $patch_line = $_; - if (m/^\+\+\+\s+(\S+)/ or m/^---\s+(\S+)/) { + if (m/^ mode change [0-7]+ => [0-7]+ (\S+)\s*$/) { + my $filename = $1; + push(@files, $filename); + } elsif (m/^rename (?:from|to) (\S+)\s*$/) { + my $filename = $1; + push(@files, $filename); + } elsif (m/^diff --git a\/(\S+) b\/(\S+)\s*$/) { + my $filename1 = $1; + my $filename2 = $2; + push(@files, $filename1); + push(@files, $filename2); + } elsif (m/^\+\+\+\s+(\S+)/ or m/^---\s+(\S+)/) { my $filename = $1; $filename =~ s@^[^/]*/@@; $filename =~ s@\n@@; diff --git a/scripts/tags.sh b/scripts/tags.sh index e587610..66f08bb 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -179,9 +179,9 @@ regex_c=( '/\<CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/ClearPage\1/' '/\<__CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/__ClearPage\1/' '/\<TESTCLEARFLAG_FALSE(\([[:alnum:]_]*\).*/TestClearPage\1/' - '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/Page\1/' - '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__SetPage\1/' - '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__ClearPage\1/' + '/^PAGE_TYPE_OPS(\([[:alnum:]_]*\).*/Page\1/' + '/^PAGE_TYPE_OPS(\([[:alnum:]_]*\).*/__SetPage\1/' + '/^PAGE_TYPE_OPS(\([[:alnum:]_]*\).*/__ClearPage\1/' '/^TASK_PFA_TEST([^,]*, *\([[:alnum:]_]*\))/task_\1/' '/^TASK_PFA_SET([^,]*, *\([[:alnum:]_]*\))/task_set_\1/' '/^TASK_PFA_CLEAR([^,]*, *\([[:alnum:]_]*\))/task_clear_\1/' diff --git 
a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 6c16f77..74e5912 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -1,3 +1,6 @@ +/fd-001-lookup +/fd-002-posix-eq +/fd-003-kthread /proc-loadavg-001 /proc-self-map-files-001 /proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index dbb87e5..db310ee 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -1,6 +1,9 @@ -CFLAGS += -Wall -O2 +CFLAGS += -Wall -O2 -Wno-unused-function TEST_GEN_PROGS := +TEST_GEN_PROGS += fd-001-lookup +TEST_GEN_PROGS += fd-002-posix-eq +TEST_GEN_PROGS += fd-003-kthread TEST_GEN_PROGS += proc-loadavg-001 TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/fd-001-lookup.c b/tools/testing/selftests/proc/fd-001-lookup.c new file mode 100644 index 0000000..a2010df --- /dev/null +++ b/tools/testing/selftests/proc/fd-001-lookup.c @@ -0,0 +1,168 @@ +/* + * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test /proc/*/fd lookup. 
+#define _GNU_SOURCE +#undef NDEBUG +#include <assert.h> +#include <dirent.h> +#include <errno.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include "proc.h" + +/* lstat(2) has more "coverage" in case non-symlink pops up somehow. */ +static void test_lookup_pass(const char *pathname) +{ + struct stat st; + ssize_t rv; + + memset(&st, 0, sizeof(struct stat)); + rv = lstat(pathname, &st); + assert(rv == 0); + assert(S_ISLNK(st.st_mode)); +} + +static void test_lookup_fail(const char *pathname) +{ + struct stat st; + ssize_t rv; + + rv = lstat(pathname, &st); + assert(rv == -1 && errno == ENOENT); +} + +static void test_lookup(unsigned int fd) +{ + char buf[64]; + unsigned int c; + unsigned int u; + int i; + + snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd); + test_lookup_pass(buf); + + /* leading junk */ + for (c = 1; c <= 255; c++) { + if (c == '/') + continue; + snprintf(buf, sizeof(buf), "/proc/self/fd/%c%u", c, fd); + test_lookup_fail(buf); + } + + /* trailing junk */ + for (c = 1; c <= 255; c++) { + if (c == '/') + continue; + snprintf(buf, sizeof(buf), "/proc/self/fd/%u%c", fd, c); + test_lookup_fail(buf); + } + + for (i = INT_MIN; i < INT_MIN + 1024; i++) { + snprintf(buf, sizeof(buf), "/proc/self/fd/%d", i); + test_lookup_fail(buf); + } + for (i = -1024; i < 0; i++) { + snprintf(buf, sizeof(buf), "/proc/self/fd/%d", i); + test_lookup_fail(buf); + } + for (u = INT_MAX - 1024; u <= (unsigned int)INT_MAX + 1024; u++) { + snprintf(buf, sizeof(buf), "/proc/self/fd/%u", u); + test_lookup_fail(buf); + } + for (u = UINT_MAX - 1024; u != 0; u++) { + snprintf(buf, sizeof(buf), "/proc/self/fd/%u", u); + test_lookup_fail(buf); + } + + +} + +int main(void) +{ + struct dirent *de; + unsigned int fd, target_fd; + + if (unshare(CLONE_FILES) == -1) + return 1; + + /* Wipe fdtable. 
*/ + do { + DIR *d; + + d = opendir("/proc/self/fd"); + if (!d) + return 1; + + de = xreaddir(d); + assert(de->d_type == DT_DIR); + assert(streq(de->d_name, ".")); + + de = xreaddir(d); + assert(de->d_type == DT_DIR); + assert(streq(de->d_name, "..")); +next: + de = xreaddir(d); + if (de) { + unsigned long long fd_ull; + unsigned int fd; + char *end; + + assert(de->d_type == DT_LNK); + + fd_ull = xstrtoull(de->d_name, &end); + assert(*end == '\0'); + assert(fd_ull == (unsigned int)fd_ull); + + fd = fd_ull; + if (fd == dirfd(d)) + goto next; + close(fd); + } + + closedir(d); + } while (de); + + /* Now fdtable is clean. */ + + fd = open("/", O_PATH|O_DIRECTORY); + assert(fd == 0); + test_lookup(fd); + close(fd); + + /* Clean again! */ + + fd = open("/", O_PATH|O_DIRECTORY); + assert(fd == 0); + /* Default RLIMIT_NOFILE-1 */ + target_fd = 1023; + while (target_fd > 0) { + if (dup2(fd, target_fd) == target_fd) + break; + target_fd /= 2; + } + assert(target_fd > 0); + close(fd); + test_lookup(target_fd); + close(target_fd); + + return 0; +} diff --git a/tools/testing/selftests/proc/fd-002-posix-eq.c b/tools/testing/selftests/proc/fd-002-posix-eq.c new file mode 100644 index 0000000..417322c --- /dev/null +++ b/tools/testing/selftests/proc/fd-002-posix-eq.c @@ -0,0 +1,57 @@ +/* + * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that open(/proc/*/fd/*) opens the same file. +#undef NDEBUG +#include <assert.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +int main(void) +{ + int fd0, fd1, fd2; + struct stat st0, st1, st2; + char buf[64]; + int rv; + + fd0 = open("/", O_DIRECTORY|O_RDONLY); + assert(fd0 >= 0); + + snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd0); + fd1 = open(buf, O_RDONLY); + assert(fd1 >= 0); + + snprintf(buf, sizeof(buf), "/proc/thread-self/fd/%u", fd0); + fd2 = open(buf, O_RDONLY); + assert(fd2 >= 0); + + rv = fstat(fd0, &st0); + assert(rv == 0); + rv = fstat(fd1, &st1); + assert(rv == 0); + rv = fstat(fd2, &st2); + assert(rv == 0); + + assert(st0.st_dev == st1.st_dev); + assert(st0.st_ino == st1.st_ino); + + assert(st0.st_dev == st2.st_dev); + assert(st0.st_ino == st2.st_ino); + + return 0; +} diff --git a/tools/testing/selftests/proc/fd-003-kthread.c b/tools/testing/selftests/proc/fd-003-kthread.c new file mode 100644 index 0000000..1d659d5 --- /dev/null +++ b/tools/testing/selftests/proc/fd-003-kthread.c @@ -0,0 +1,178 @@ +/* + * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that /proc/$KERNEL_THREAD/fd/ is empty. +#define _GNU_SOURCE +#undef NDEBUG +#include <sys/syscall.h> +#include <assert.h> +#include <dirent.h> +#include <limits.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +#include "proc.h" + +#define PF_KHTREAD 0x00200000 + +/* + * Test for kernel threadness atomically with openat(). + * + * Return /proc/$PID/fd descriptor if process is kernel thread. + * Return -1 if a process is userspace process. + */ +static int kernel_thread_fd(unsigned int pid) +{ + unsigned int flags = 0; + char buf[4096]; + int dir_fd, fd; + ssize_t rv; + + snprintf(buf, sizeof(buf), "/proc/%u", pid); + dir_fd = open(buf, O_RDONLY|O_DIRECTORY); + if (dir_fd == -1) + return -1; + + /* + * Believe it or not, struct task_struct::flags is directly exposed + * to userspace! + */ + fd = openat(dir_fd, "stat", O_RDONLY); + if (fd == -1) { + close(dir_fd); + return -1; + } + rv = read(fd, buf, sizeof(buf)); + close(fd); + if (0 < rv && rv <= sizeof(buf)) { + unsigned long long flags_ull; + char *p, *end; + int i; + + assert(buf[rv - 1] == '\n'); + buf[rv - 1] = '\0'; + + /* Search backwards: ->comm can contain whitespace and ')'. 
*/ + for (i = 0; i < 43; i++) { + p = strrchr(buf, ' '); + assert(p); + *p = '\0'; + } + + p = strrchr(buf, ' '); + assert(p); + + flags_ull = xstrtoull(p + 1, &end); + assert(*end == '\0'); + assert(flags_ull == (unsigned int)flags_ull); + + flags = flags_ull; + } + + fd = -1; + if (flags & PF_KHTREAD) { + fd = openat(dir_fd, "fd", O_RDONLY|O_DIRECTORY); + } + close(dir_fd); + return fd; +} + +static void test_readdir(int fd) +{ + DIR *d; + struct dirent *de; + + d = fdopendir(fd); + assert(d); + + de = xreaddir(d); + assert(streq(de->d_name, ".")); + assert(de->d_type == DT_DIR); + + de = xreaddir(d); + assert(streq(de->d_name, "..")); + assert(de->d_type == DT_DIR); + + de = xreaddir(d); + assert(!de); +} + +static inline int sys_statx(int dirfd, const char *pathname, int flags, + unsigned int mask, void *stx) +{ + return syscall(SYS_statx, dirfd, pathname, flags, mask, stx); +} + +static void test_lookup_fail(int fd, const char *pathname) +{ + char stx[256] __attribute__((aligned(8))); + int rv; + + rv = sys_statx(fd, pathname, AT_SYMLINK_NOFOLLOW, 0, (void *)stx); + assert(rv == -1 && errno == ENOENT); +} + +static void test_lookup(int fd) +{ + char buf[64]; + unsigned int u; + int i; + + for (i = INT_MIN; i < INT_MIN + 1024; i++) { + snprintf(buf, sizeof(buf), "%d", i); + test_lookup_fail(fd, buf); + } + for (i = -1024; i < 1024; i++) { + snprintf(buf, sizeof(buf), "%d", i); + test_lookup_fail(fd, buf); + } + for (u = INT_MAX - 1024; u < (unsigned int)INT_MAX + 1024; u++) { + snprintf(buf, sizeof(buf), "%u", u); + test_lookup_fail(fd, buf); + } + for (u = UINT_MAX - 1024; u != 0; u++) { + snprintf(buf, sizeof(buf), "%u", u); + test_lookup_fail(fd, buf); + } +} + +int main(void) +{ + unsigned int pid; + int fd; + + /* + * In theory this will loop indefinitely if kernel threads are exiled + * from /proc. + * + * Start with kthreadd. + */ + pid = 2; + while ((fd = kernel_thread_fd(pid)) == -1 && pid < 1024) { + pid++; + } + /* EACCES if run as non-root. 
*/ + if (pid >= 1024) + return 1; + + test_readdir(fd); + test_lookup(fd); + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-uptime.h b/tools/testing/selftests/proc/proc-uptime.h index 0e464b5..dc6a42b 100644 --- a/tools/testing/selftests/proc/proc-uptime.h +++ b/tools/testing/selftests/proc/proc-uptime.h @@ -20,21 +20,7 @@ #include <stdlib.h> #include <unistd.h> -static unsigned long long xstrtoull(const char *p, char **end) -{ - if (*p == '0') { - *end = (char *)p + 1; - return 0; - } else if ('1' <= *p && *p <= '9') { - unsigned long long val; - - errno = 0; - val = strtoull(p, end, 10); - assert(errno == 0); - return val; - } else - assert(0); -} +#include "proc.h" static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle) { diff --git a/tools/testing/selftests/proc/proc.h b/tools/testing/selftests/proc/proc.h new file mode 100644 index 0000000..4e17816 --- /dev/null +++ b/tools/testing/selftests/proc/proc.h @@ -0,0 +1,39 @@ +#pragma once +#undef NDEBUG +#include <assert.h> +#include <dirent.h> +#include <errno.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +static inline bool streq(const char *s1, const char *s2) +{ + return strcmp(s1, s2) == 0; +} + +static unsigned long long xstrtoull(const char *p, char **end) +{ + if (*p == '0') { + *end = (char *)p + 1; + return 0; + } else if ('1' <= *p && *p <= '9') { + unsigned long long val; + + errno = 0; + val = strtoull(p, end, 10); + assert(errno == 0); + return val; + } else + assert(0); +} + +static struct dirent *xreaddir(DIR *d) +{ + struct dirent *de; + + errno = 0; + de = readdir(d); + assert(de || errno == 0); + return de; +} diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c index 1e73c22..563e752 100644 --- a/tools/testing/selftests/proc/read.c +++ b/tools/testing/selftests/proc/read.c @@ -31,22 +31,7 @@ #include <fcntl.h> #include <unistd.h> -static inline bool streq(const char *s1, const char *s2) -{ - return strcmp(s1, s2) 
== 0; -} - -static struct dirent *xreaddir(DIR *d) -{ - struct dirent *de; - - errno = 0; - de = readdir(d); - if (!de && errno != 0) { - exit(1); - } - return de; -} +#include "proc.h" static void f_reg(DIR *d, const char *filename) { diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index a8783f4..cce853d 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -131,6 +131,7 @@ static const char * const page_flag_names[] = { [KPF_KSM] = "x:ksm", [KPF_THP] = "t:thp", [KPF_BALLOON] = "o:balloon", + [KPF_PGTABLE] = "g:pgtable", [KPF_ZERO_PAGE] = "z:zero_page", [KPF_IDLE] = "i:idle_page", |