diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 4 | ||||
-rw-r--r-- | mm/mmap.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 2 | ||||
-rw-r--r-- | mm/percpu-km.c | 16 | ||||
-rw-r--r-- | mm/percpu-vm.c | 162 | ||||
-rw-r--r-- | mm/percpu.c | 526 | ||||
-rw-r--r-- | mm/shmem.c | 2 |
8 files changed, 477 insertions, 239 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbb..64ec49d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -455,7 +455,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi_wb_init(&bdi->wb, bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0); + err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); if (err) goto err; } @@ -470,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->write_bandwidth = INIT_BW; bdi->avg_write_bandwidth = INIT_BW; - err = fprop_local_init_percpu(&bdi->completions); + err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); if (err) { err: @@ -3196,7 +3196,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); } @@ -539,7 +539,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d73ef..5085994 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1777,7 +1777,7 @@ void __init page_writeback_init(void) writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - fprop_global_init(&writeout_completions); + fprop_global_init(&writeout_completions, GFP_KERNEL); } /** diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 89633fe..10e3d0b 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -33,17 +33,14 @@ #include <linux/log2.h> -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - unsigned int cpu; - - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); - return 0; } -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { /* nada */ } @@ -70,6 +67,11 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->data = pages; chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, 0, nr_pages); + spin_unlock_irq(&pcpu_lock); + return chunk; } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 5110816..538998a 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -20,46 +20,25 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, } /** - * pcpu_get_pages_and_bitmap - get temp pages array and bitmap + * pcpu_get_pages - get temp pages array * @chunk: chunk of interest - * @bitmapp: output parameter for bitmap - * @may_alloc: may allocate the array * - * Returns pointer to array of pointers to struct page and bitmap, - * both of which can be indexed with pcpu_page_idx(). The returned - * array is cleared to zero and *@bitmapp is copied from - * @chunk->populated. Note that there is only one array and bitmap - * and access exclusion is the caller's responsibility. - * - * CONTEXT: - * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. - * Otherwise, don't care. + * Returns pointer to array of pointers to struct page which can be indexed + * with pcpu_page_idx(). Note that there is only one array and accesses + * should be serialized by pcpu_alloc_mutex. * * RETURNS: - * Pointer to temp pages array on success, NULL on failure. + * Pointer to temp pages array on success. */ -static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, - unsigned long **bitmapp, - bool may_alloc) +static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) { static struct page **pages; - static unsigned long *bitmap; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); - size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * - sizeof(unsigned long); - - if (!pages || !bitmap) { - if (may_alloc && !pages) - pages = pcpu_mem_zalloc(pages_size); - if (may_alloc && !bitmap) - bitmap = pcpu_mem_zalloc(bitmap_size); - if (!pages || !bitmap) - return NULL; - } - bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); + lockdep_assert_held(&pcpu_alloc_mutex); - *bitmapp = bitmap; + if (!pages) + pages = pcpu_mem_zalloc(pages_size); return pages; } @@ -67,7 +46,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * pcpu_free_pages - free pages which were allocated for @chunk * @chunk: chunk pages were allocated for * @pages: array of pages to be freed, indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be freed * @page_end: page index of the last page to be freed + 1 * @@ -75,8 +53,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * The pages were allocated for @chunk. */ static void pcpu_free_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -95,7 +72,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * pcpu_alloc_pages - allocates pages for @chunk * @chunk: target chunk * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 * @@ -104,8 +80,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; unsigned int cpu, tcpu; @@ -164,7 +139,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest * @pages: pages array which can be used to pass information to free - * @populated: populated bitmap * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 * @@ -175,8 +149,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * proper pre/post flush functions. */ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -192,8 +165,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), page_end - page_start); } - - bitmap_clear(populated, page_start, page_end - page_start); } /** @@ -228,7 +199,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest * @pages: pages array containing pages to be mapped - * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * @@ -236,13 +206,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * caller is responsible for calling pcpu_post_map_flush() after all * mappings are complete. * - * This function is responsible for setting corresponding bits in - * @chunk->populated bitmap and whatever is necessary for reverse - * lookup (addr -> chunk). + * This function is responsible for setting up whatever is necessary for + * reverse lookup (addr -> chunk). */ static int pcpu_map_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu, tcpu; int i, err; @@ -253,18 +221,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk, page_end - page_start); if (err < 0) goto err; - } - /* mapping successful, link chunk and mark populated */ - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) + for (i = page_start; i < page_end; i++) pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], chunk); - __set_bit(i, populated); } - return 0; - err: for_each_possible_cpu(tcpu) { if (tcpu == cpu) @@ -299,123 +261,69 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest - * @off: offset to the area to populate - * @size: size of the area to populate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, populate and map pages [@page_start,@page_end) into - * @chunk. The area is cleared on return. + * @chunk. * * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int free_end = page_start, unmap_end = page_start; struct page **pages; - unsigned long *populated; - unsigned int cpu; - int rs, re, rc; - - /* quick path, check whether all pages are already there */ - rs = page_start; - pcpu_next_pop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - goto clear; - /* need to allocate and map pages, this chunk can't be immutable */ - WARN_ON(chunk->immutable); - - pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + pages = pcpu_get_pages(chunk); if (!pages) return -ENOMEM; - /* alloc and map */ - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); - if (rc) - goto err_free; - free_end = re; - } + if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) + return -ENOMEM; - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_map_pages(chunk, pages, populated, rs, re); - if (rc) - goto err_unmap; - unmap_end = re; + if (pcpu_map_pages(chunk, pages, page_start, page_end)) { + pcpu_free_pages(chunk, pages, page_start, page_end); + return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); -clear: - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; - -err_unmap: - pcpu_pre_unmap_flush(chunk, page_start, unmap_end); - pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); - pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); -err_free: - pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) - pcpu_free_pages(chunk, pages, populated, rs, re); - return rc; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. + * from @chunk. * * CONTEXT: * pcpu_alloc_mutex. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); struct page **pages; - unsigned long *populated; - int rs, re; - - /* quick path, check whether it's empty already */ - rs = page_start; - pcpu_next_unpop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - return; - - /* immutable chunks can't be depopulated */ - WARN_ON(chunk->immutable); /* * If control reaches here, there must have been at least one * successful population attempt so the temp pages array must * be available now. */ - pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + pages = pcpu_get_pages(chunk); BUG_ON(!pages); /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_unmap_pages(chunk, pages, page_start, page_end); /* no need to flush tlb, vmalloc will handle it lazily */ - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_free_pages(chunk, pages, populated, rs, re); - - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); + pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(void) diff --git a/mm/percpu.c b/mm/percpu.c index da997f9..e10f9f7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -76,6 +76,10 @@ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 +#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +#define PCPU_EMPTY_POP_PAGES_LOW 2 +#define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -102,12 +106,16 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + struct work_struct map_extend_work;/* async ->map[] extension */ + void *data; /* chunk data */ int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ + int nr_populated; /* # of populated pages */ unsigned long populated[]; /* populated bitmap */ }; @@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk; static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; +static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ + +static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ + /* - * Synchronization rules. - * - * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks, populated bitmap and - * vmalloc mapping. The latter is a spinlock and protects the index - * data structures - chunk slots, chunks and area maps in chunks. - * - * During allocation, pcpu_alloc_mutex is kept locked all the time and - * pcpu_lock is grabbed and released as necessary. All actual memory - * allocations are done using GFP_KERNEL with pcpu_lock released. In - * general, percpu memory can't be allocated with irq off but - * irqsave/restore are still used in alloc path so that it can be used - * from early init path - sched_init() specifically. - * - * Free path accesses and alters only the index data structures, so it - * can be safely called from atomic context. When memory needs to be - * returned to the system, free path schedules reclaim_work which - * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be - * reclaimed, release both locks and frees the chunks. Note that it's - * necessary to grab both locks to remove a chunk from circulation as - * allocation path might be referencing the chunk with only - * pcpu_alloc_mutex locked. + * The number of empty populated pages, protected by pcpu_lock. The + * reserved chunk doesn't contribute to the count. */ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ -static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ +static int pcpu_nr_empty_pop_pages; -static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* + * Balance work is used to populate or destroy chunks asynchronously. We + * try to keep the number of populated free pages between + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one + * empty chunk. + */ +static void pcpu_balance_workfn(struct work_struct *work); +static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); +static bool pcpu_async_enabled __read_mostly; +static bool pcpu_atomic_alloc_failed; -/* reclaim work to release fully free chunks, scheduled from free path */ -static void pcpu_reclaim(struct work_struct *work); -static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); +static void pcpu_schedule_balance_work(void) +{ + if (pcpu_async_enabled) + schedule_work(&pcpu_balance_work); +} static bool pcpu_addr_in_first_chunk(void *addr) { @@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size) } /** + * pcpu_count_occupied_pages - count the number of pages an area occupies + * @chunk: chunk of interest + * @i: index of the area in question + * + * Count the number of pages chunk's @i'th area occupies. When the area's + * start and/or end address isn't aligned to page boundary, the straddled + * page is included in the count iff the rest of the page is free. + */ +static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) +{ + int off = chunk->map[i] & ~1; + int end = chunk->map[i + 1] & ~1; + + if (!PAGE_ALIGNED(off) && i > 0) { + int prev = chunk->map[i - 1]; + + if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) + off = round_down(off, PAGE_SIZE); + } + + if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { + int next = chunk->map[i + 1]; + int nend = chunk->map[i + 2] & ~1; + + if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) + end = round_up(end, PAGE_SIZE); + } + + return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); +} + +/** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on @@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) /** * pcpu_need_to_extend - determine whether chunk area map needs to be extended * @chunk: chunk of interest + * @is_atomic: the allocation context * - * Determine whether area map of @chunk needs to be extended to - * accommodate a new allocation. + * Determine whether area map of @chunk needs to be extended. If + * @is_atomic, only the amount necessary for a new allocation is + * considered; however, async extension is scheduled if the left amount is + * low. If !@is_atomic, it aims for more empty space. Combined, this + * ensures that the map is likely to have enough available space to + * accomodate atomic allocations which can't extend maps directly. * * CONTEXT: * pcpu_lock. @@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) * New target map allocation length if extension is necessary, 0 * otherwise. */ -static int pcpu_need_to_extend(struct pcpu_chunk *chunk) +static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { - int new_alloc; + int margin, new_alloc; + + if (is_atomic) { + margin = 3; + + if (chunk->map_alloc < + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && + pcpu_async_enabled) + schedule_work(&chunk->map_extend_work); + } else { + margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; + } - if (chunk->map_alloc >= chunk->map_used + 3) + if (chunk->map_alloc >= chunk->map_used + margin) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 3) + while (new_alloc < chunk->map_used + margin) new_alloc *= 2; return new_alloc; @@ -418,11 +469,76 @@ out_unlock: return 0; } +static void pcpu_map_extend_workfn(struct work_struct *work) +{ + struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, + map_extend_work); + int new_alloc; + + spin_lock_irq(&pcpu_lock); + new_alloc = pcpu_need_to_extend(chunk, false); + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); +} + +/** + * pcpu_fit_in_area - try to fit the requested allocation in a candidate area + * @chunk: chunk the candidate area belongs to + * @off: the offset to the start of the candidate area + * @this_size: the size of the candidate area + * @size: the size of the target allocation + * @align: the alignment of the target allocation + * @pop_only: only allocate from already populated region + * + * We're trying to allocate @size bytes aligned at @align. @chunk's area + * at @off sized @this_size is a candidate. This function determines + * whether the target allocation fits in the candidate area and returns the + * number of bytes to pad after @off. If the target area doesn't fit, -1 + * is returned. + * + * If @pop_only is %true, this function only considers the already + * populated part of the candidate area. + */ +static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, + int size, int align, bool pop_only) +{ + int cand_off = off; + + while (true) { + int head = ALIGN(cand_off, align) - off; + int page_start, page_end, rs, re; + + if (this_size < head + size) + return -1; + + if (!pop_only) + return head; + + /* + * If the first unpopulated page is beyond the end of the + * allocation, the whole allocation is populated; + * otherwise, retry from the end of the unpopulated area. + */ + page_start = PFN_DOWN(head + off); + page_end = PFN_UP(head + off + size); + + rs = page_start; + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); + if (rs >= page_end) + return head; + cand_off = re * PAGE_SIZE; + } +} + /** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes * @align: wanted align + * @pop_only: allocate only from the populated area + * @occ_pages_p: out param for the number of pages the area occupies * * Try to allocate @size bytes area aligned at @align from @chunk. * Note that this function only allocates the offset. It doesn't @@ -437,7 +553,8 @@ out_unlock: * Allocated offset in @chunk on success, -1 if no matching area is * found. */ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, + bool pop_only, int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; @@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) if (off & 1) continue; - /* extra for alignment requirement */ - head = ALIGN(off, align) - off; - this_size = (p[1] & ~1) - off; - if (this_size < head + size) { + + head = pcpu_fit_in_area(chunk, off, this_size, size, align, + pop_only); + if (head < 0) { if (!seen_free) { chunk->first_free = i; seen_free = true; @@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) chunk->free_size -= size; *p |= 1; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); pcpu_chunk_relocate(chunk, oslot); return off; } @@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * pcpu_free_area - free area to a pcpu_chunk * @chunk: chunk of interest * @freeme: offset of area to free + * @occ_pages_p: out param for the number of pages the area occupies * * Free area starting from @freeme to @chunk. Note that this function * only modifies the allocation map. It doesn't depopulate or unmap @@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * CONTEXT: * pcpu_lock. */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, + int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int off = 0; @@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) *p = off &= ~1; chunk->free_size += (p[1] & ~1) - off; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); + /* merge with next? */ if (!(p[1] & 1)) to_free++; @@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); + INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) pcpu_mem_free(chunk, pcpu_chunk_struct_size); } +/** + * pcpu_chunk_populated - post-population bookkeeping + * @chunk: pcpu_chunk which got populated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been populated to @chunk. Update + * the bookkeeping information accordingly. Must be called after each + * successful population. + */ +static void pcpu_chunk_populated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_set(chunk->populated, page_start, nr); + chunk->nr_populated += nr; + pcpu_nr_empty_pop_pages += nr; +} + +/** + * pcpu_chunk_depopulated - post-depopulation bookkeeping + * @chunk: pcpu_chunk which got depopulated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been depopulated from @chunk. + * Update the bookkeeping information accordingly. Must be called after + * each successful depopulation. + */ +static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_clear(chunk->populated, page_start, nr); + chunk->nr_populated -= nr; + pcpu_nr_empty_pop_pages -= nr; +} + /* * Chunk management implementation. * @@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available + * @gfp: allocation flags * - * Allocate percpu area of @size bytes aligned at @align. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't + * contain %GFP_KERNEL, the allocation is atomic. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, + gfp_t gfp) { static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - int slot, off, new_alloc; + bool is_atomic = !(gfp & GFP_KERNEL); + int occ_pages = 0; + int slot, off, new_alloc, cpu, ret; unsigned long flags; void __percpu *ptr; @@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) return NULL; } - mutex_lock(&pcpu_alloc_mutex); spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) goto fail_unlock; } - while ((new_alloc = pcpu_need_to_extend(chunk))) { + while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, new_alloc) < 0) { + if (is_atomic || + pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; @@ -764,13 +934,15 @@ restart: if (size > chunk->contig_hint) continue; - new_alloc = pcpu_need_to_extend(chunk); + new_alloc = pcpu_need_to_extend(chunk, is_atomic); if (new_alloc) { + if (is_atomic) + continue; spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); /* @@ -780,74 +952,134 @@ restart: goto restart; } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; } } - /* hmmm... no space left, create a new chunk */ spin_unlock_irqrestore(&pcpu_lock, flags); - chunk = pcpu_create_chunk(); - if (!chunk) { - err = "failed to allocate new chunk"; - goto fail_unlock_mutex; + /* + * No space left. Create a new chunk. We don't want multiple + * tasks to create chunks simultaneously. Serialize and create iff + * there's still no empty chunk after grabbing the mutex. + */ + if (is_atomic) + goto fail; + + mutex_lock(&pcpu_alloc_mutex); + + if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { + chunk = pcpu_create_chunk(); + if (!chunk) { + mutex_unlock(&pcpu_alloc_mutex); + err = "failed to allocate new chunk"; + goto fail; + } + + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_chunk_relocate(chunk, -1); + } else { + spin_lock_irqsave(&pcpu_lock, flags); } - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_chunk_relocate(chunk, -1); + mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: spin_unlock_irqrestore(&pcpu_lock, flags); - /* populate, map and clear the area */ - if (pcpu_populate_chunk(chunk, off, size)) { - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_free_area(chunk, off); - err = "failed to populate"; - goto fail_unlock; + /* populate if not all pages are already there */ + if (!is_atomic) { + int page_start, page_end, rs, re; + + mutex_lock(&pcpu_alloc_mutex); + + page_start = PFN_DOWN(off); + page_end = PFN_UP(off + size); + + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + WARN_ON(chunk->immutable); + + ret = pcpu_populate_chunk(chunk, rs, re); + + spin_lock_irqsave(&pcpu_lock, flags); + if (ret) { + mutex_unlock(&pcpu_alloc_mutex); + pcpu_free_area(chunk, off, &occ_pages); + err = "failed to populate"; + goto fail_unlock; + } + pcpu_chunk_populated(chunk, rs, re); + spin_unlock_irqrestore(&pcpu_lock, flags); + } + + mutex_unlock(&pcpu_alloc_mutex); } - mutex_unlock(&pcpu_alloc_mutex); + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages -= occ_pages; + + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + + /* clear the areas and return address relative to base address */ + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); - /* return address relative to base address */ ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size); return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); -fail_unlock_mutex: - mutex_unlock(&pcpu_alloc_mutex); - if (warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " - "%s\n", size, align, err); +fail: + if (!is_atomic && warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); dump_stack(); if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); } + if (is_atomic) { + /* see the flag handling in pcpu_blance_workfn() */ + pcpu_atomic_alloc_failed = true; + pcpu_schedule_balance_work(); + } return NULL; } /** - * __alloc_percpu - allocate dynamic percpu area + * __alloc_percpu_gfp - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @gfp: allocation flags * - * Allocate zero-filled percpu area of @size bytes aligned at @align. - * Might sleep. Might trigger writeouts. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate zero-filled percpu area of @size bytes aligned at @align. If + * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can + * be called from any context but is a lot more likely to fail. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ +void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) +{ + return pcpu_alloc(size, align, false, gfp); +} +EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). + */ void __percpu *__alloc_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, false); + return pcpu_alloc(size, align, false, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__alloc_percpu); @@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); */ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, true); + return pcpu_alloc(size, align, true, GFP_KERNEL); } /** - * pcpu_reclaim - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * Reclaim all fully free chunks except for the first one. - * - * CONTEXT: - * workqueue context. */ -static void pcpu_reclaim(struct work_struct *work) +static void pcpu_balance_workfn(struct work_struct *work) { - LIST_HEAD(todo); - struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; + LIST_HEAD(to_free); + struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; + int slot, nr_to_pop, ret; + /* + * There's no reason to keep around multiple unused chunks and VM + * areas can be scarce. Destroy all free chunks except for one. + */ mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, head, list) { + list_for_each_entry_safe(chunk, next, free_head, list) { WARN_ON(chunk->immutable); /* spare the first one */ - if (chunk == list_first_entry(head, struct pcpu_chunk, list)) + if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; - list_move(&chunk->list, &todo); + list_move(&chunk->list, &to_free); } spin_unlock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + list_for_each_entry_safe(chunk, next, &to_free, list) { + int rs, re; + + pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_depopulate_chunk(chunk, rs, re); + spin_lock_irq(&pcpu_lock); + pcpu_chunk_depopulated(chunk, rs, re); + spin_unlock_irq(&pcpu_lock); + } pcpu_destroy_chunk(chunk); } + /* + * Ensure there are certain number of free populated pages for + * atomic allocs. Fill up from the most packed so that atomic + * allocs don't increase fragmentation. If atomic allocation + * failed previously, always populate the maximum amount. This + * should prevent atomic allocs larger than PAGE_SIZE from keeping + * failing indefinitely; however, large atomic allocs are not + * something we support properly and can be highly unreliable and + * inefficient. + */ +retry_pop: + if (pcpu_atomic_alloc_failed) { + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; + /* best effort anyway, don't worry about synchronization */ + pcpu_atomic_alloc_failed = false; + } else { + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - + pcpu_nr_empty_pop_pages, + 0, PCPU_EMPTY_POP_PAGES_HIGH); + } + + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { + int nr_unpop = 0, rs, re; + + if (!nr_to_pop) + break; + + spin_lock_irq(&pcpu_lock); + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + nr_unpop = pcpu_unit_pages - chunk->nr_populated; + if (nr_unpop) + break; + } + spin_unlock_irq(&pcpu_lock); + + if (!nr_unpop) + continue; + + /* @chunk can't go away while pcpu_alloc_mutex is held */ + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + int nr = min(re - rs, nr_to_pop); + + ret = pcpu_populate_chunk(chunk, rs, rs + nr); + if (!ret) { + nr_to_pop -= nr; + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, rs, rs + nr); + spin_unlock_irq(&pcpu_lock); + } else { + nr_to_pop = 0; + } + + if (!nr_to_pop) + break; + } + } + + if (nr_to_pop) { + /* ran out of chunks to populate, create a new one and retry */ + chunk = pcpu_create_chunk(); + if (chunk) { + spin_lock_irq(&pcpu_lock); + pcpu_chunk_relocate(chunk, -1); + spin_unlock_irq(&pcpu_lock); + goto retry_pop; + } + } + mutex_unlock(&pcpu_alloc_mutex); } @@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off; + int off, occ_pages; if (!ptr) return; @@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off); + pcpu_free_area(chunk, off, &occ_pages); + + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += occ_pages; /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_size == pcpu_unit_size) { @@ -946,7 +1258,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_reclaim_work); + pcpu_schedule_balance_work(); break; } } @@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); + INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); + schunk->nr_populated = pcpu_unit_pages; if (ai->reserved_size) { schunk->free_size = ai->reserved_size; @@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); + INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->immutable = true; bitmap_fill(dchunk->populated, pcpu_unit_pages); + dchunk->nr_populated = pcpu_unit_pages; dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[0] = 1; @@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; + pcpu_nr_empty_pop_pages += + pcpu_count_occupied_pages(pcpu_first_chunk, 1); pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ @@ -1932,8 +2250,6 @@ void __init setup_per_cpu_areas(void) if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); - - pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ @@ -1967,3 +2283,15 @@ void __init percpu_init_late(void) spin_unlock_irqrestore(&pcpu_lock, flags); } } + +/* + * Percpu allocator is initialized early during boot when neither slab or + * workqueue is available. Plug async management until everything is up + * and running. + */ +static int __init percpu_enable_async(void) +{ + pcpu_async_enabled = true; + return 0; +} +subsys_initcall(percpu_enable_async); @@ -2993,7 +2993,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #endif spin_lock_init(&sbinfo->stat_lock); - if (percpu_counter_init(&sbinfo->used_blocks, 0)) + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; |