From 41fd59b7f9bdde2a473450680411c2016017b992 Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Wed, 12 Jul 2023 11:16:20 +0800
Subject: [PATCH 1/6] mm/percpu: Remove some local variables in pcpu_populate_pte

In function pcpu_populate_pte() the variables p4d, pud and pmd are
already defined and can be reused; remove the duplicated local
variables.

Signed-off-by: Bibo Mao
Signed-off-by: Dennis Zhou
---
 mm/percpu.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index 28e07ede46f6..85e3f9b2a61f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -3189,32 +3189,26 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
 	pmd_t *pmd;
 
 	if (pgd_none(*pgd)) {
-		p4d_t *new;
-
-		new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
-		if (!new)
+		p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
+		if (!p4d)
 			goto err_alloc;
-		pgd_populate(&init_mm, pgd, new);
+		pgd_populate(&init_mm, pgd, p4d);
 	}
 
 	p4d = p4d_offset(pgd, addr);
 	if (p4d_none(*p4d)) {
-		pud_t *new;
-
-		new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
-		if (!new)
+		pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+		if (!pud)
 			goto err_alloc;
-		p4d_populate(&init_mm, p4d, new);
+		p4d_populate(&init_mm, p4d, pud);
 	}
 
 	pud = pud_offset(p4d, addr);
 	if (pud_none(*pud)) {
-		pmd_t *new;
-
-		new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
-		if (!new)
+		pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
+		if (!pmd)
 			goto err_alloc;
-		pud_populate(&init_mm, pud, new);
+		pud_populate(&init_mm, pud, pmd);
 	}
 
 	pmd = pmd_offset(pud, addr);

From 5b672085e70c2ea40f4c9d6a23848079bf0ff700 Mon Sep 17 00:00:00 2001
From: Baoquan He
Date: Fri, 21 Jul 2023 21:17:58 +0800
Subject: [PATCH 2/6] mm/percpu.c: remove redundant check

The conditional check '(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE)'
already covers the check '(!ai->dyn_size)': since
PERCPU_DYNAMIC_EARLY_SIZE is a non-zero constant, any dyn_size that
passes the first check cannot be zero. Remove the redundant one.

Signed-off-by: Baoquan He
Signed-off-by: Dennis Zhou
---
 mm/percpu.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index 85e3f9b2a61f..93b1bec2b28d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2615,7 +2615,6 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
 	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
 	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
-	PCPU_SETUP_BUG_ON(!ai->dyn_size);
 	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
 	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
 			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
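Why the removed check was redundant: PERCPU_DYNAMIC_EARLY_SIZE is a positive
compile-time constant, so any dyn_size accepted by the surviving BUG_ON is
necessarily non-zero. A minimal standalone C sketch of that implication; the
constant's value below is illustrative, not the kernel's actual definition:

#include <assert.h>
#include <stddef.h>

#define PERCPU_DYNAMIC_EARLY_SIZE	(12 << 10)	/* illustrative, > 0 */

static void setup_checks(size_t dyn_size)
{
	/* the check that remains after the patch ... */
	assert(!(dyn_size < PERCPU_DYNAMIC_EARLY_SIZE));
	/* ... already implies the removed one; this can never fire
	 * once the first assert has passed */
	assert(dyn_size != 0);
}

int main(void)
{
	setup_checks(PERCPU_DYNAMIC_EARLY_SIZE);
	setup_checks(PERCPU_DYNAMIC_EARLY_SIZE + 4096);
	return 0;
}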
From 7ee1e758bebe13d96217bcfd5230892ed44760e7 Mon Sep 17 00:00:00 2001
From: Baoquan He
Date: Sat, 22 Jul 2023 09:14:37 +0800
Subject: [PATCH 3/6] mm/percpu.c: optimize the code in pcpu_setup_first_chunk() a little bit

This removes the need for the local variable 'chunk' and simplifies the
code that calls pcpu_alloc_first_chunk() to initialize the reserved
chunk and the dynamic chunk.

Signed-off-by: Baoquan He
[Dennis: reworded first chunk init comment]
Signed-off-by: Dennis Zhou
---
 mm/percpu.c | 38 +++++++++++++++-----------------------
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index 93b1bec2b28d..ab4ba2ac91c7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2581,14 +2581,12 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 {
 	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
 	size_t static_size, dyn_size;
-	struct pcpu_chunk *chunk;
 	unsigned long *group_offsets;
 	size_t *group_sizes;
 	unsigned long *unit_off;
 	unsigned int cpu;
 	int *unit_map;
 	int group, unit, i;
-	int map_size;
 	unsigned long tmp_addr;
 	size_t alloc_size;
 
@@ -2697,7 +2695,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_atom_size = ai->atom_size;
-	pcpu_chunk_struct_size = struct_size(chunk, populated,
+	pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
 					     BITS_TO_LONGS(pcpu_unit_pages));
 
 	pcpu_stats_save_ai(ai);
@@ -2734,29 +2732,23 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 		dyn_size = ai->dyn_size - (static_size - ai->static_size);
 
 	/*
-	 * Initialize first chunk.
-	 * If the reserved_size is non-zero, this initializes the reserved
-	 * chunk.  If the reserved_size is zero, the reserved chunk is NULL
-	 * and the dynamic region is initialized here.  The first chunk,
-	 * pcpu_first_chunk, will always point to the chunk that serves
-	 * the dynamic region.
+	 * Initialize first chunk:
+	 * This chunk is broken up into 3 parts:
+	 *		< static | [reserved] | dynamic >
+	 * - static - there is no backing chunk because these allocations can
+	 *   never be freed.
+	 * - reserved (pcpu_reserved_chunk) - exists primarily to serve
+	 *   allocations from module load.
+	 * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
+	 *   chunk.
 	 */
 	tmp_addr = (unsigned long)base_addr + static_size;
-	map_size = ai->reserved_size ?: dyn_size;
-	chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
+	if (ai->reserved_size)
+		pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
+							     ai->reserved_size);
+	tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
+	pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);
 
-	/* init dynamic chunk if necessary */
-	if (ai->reserved_size) {
-		pcpu_reserved_chunk = chunk;
-
-		tmp_addr = (unsigned long)base_addr + static_size +
-			   ai->reserved_size;
-		map_size = dyn_size;
-		chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
-	}
-
-	/* link the first chunk in */
-	pcpu_first_chunk = chunk;
 	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
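The rewritten initialization above relies on the first chunk's fixed layout:
the reserved region, when present, begins immediately after the static
region, and the dynamic region follows the reserved one. A small userspace
sketch of that address arithmetic, with made-up sizes standing in for real
first-chunk values:

#include <stdio.h>

int main(void)
{
	unsigned long base_addr = 0x100000UL;	/* hypothetical chunk base */
	unsigned long static_size = 0x8000UL;	/* made-up size */
	unsigned long reserved_size = 0x2000UL;	/* 0 would mean: no reserved chunk */
	unsigned long tmp_addr;

	/* < static | [reserved] | dynamic > */
	tmp_addr = base_addr + static_size;
	if (reserved_size)
		printf("reserved chunk at 0x%lx\n", tmp_addr);	/* pcpu_reserved_chunk */

	tmp_addr = base_addr + static_size + reserved_size;
	printf("dynamic chunk at 0x%lx\n", tmp_addr);		/* pcpu_first_chunk */
	return 0;
}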
From f7d77dfc91f747f64cb00884fd6d7940c3b49fca Mon Sep 17 00:00:00 2001
From: Baoquan He
Date: Fri, 28 Jul 2023 11:02:55 +0800
Subject: [PATCH 4/6] mm/percpu.c: print error message too if atomic alloc failed

The variable 'err' is assigned an error message when an atomic
allocation fails, but the message is never printed because is_atomic is
true. Change the code to print the error message for atomic allocation
failures as well, while skipping the dump_stack() call in that case.

Signed-off-by: Baoquan He
Signed-off-by: Dennis Zhou
---
 mm/percpu.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index ab4ba2ac91c7..a7665de8485f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1890,13 +1890,15 @@ fail_unlock:
 fail:
 	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
 
-	if (!is_atomic && do_warn && warn_limit) {
+	if (do_warn && warn_limit) {
 		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
 			size, align, is_atomic, err);
-		dump_stack();
+		if (!is_atomic)
+			dump_stack();
 		if (!--warn_limit)
 			pr_info("limit reached, disable warning\n");
 	}
+
 	if (is_atomic) {
 		/* see the flag handling in pcpu_balance_workfn() */
 		pcpu_atomic_alloc_failed = true;
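The resulting failure path warns for both atomic and non-atomic failures,
dumps a stack only in the sleepable case, and throttles both through one
shared warn_limit. A userspace model of that control flow, with fprintf()
standing in for pr_warn()/pr_info() and dump_stack(), and made-up error
strings:

#include <stdbool.h>
#include <stdio.h>

static int warn_limit = 10;	/* mirrors the static limit in pcpu_alloc() */

static void alloc_failed(size_t size, size_t align, bool is_atomic,
			 bool do_warn, const char *err)
{
	if (do_warn && warn_limit) {
		fprintf(stderr, "allocation failed, size=%zu align=%zu atomic=%d, %s\n",
			size, align, is_atomic, err);
		if (!is_atomic)	/* stack dump stays non-atomic only */
			fprintf(stderr, "(stack trace would be dumped here)\n");
		if (!--warn_limit)
			fprintf(stderr, "limit reached, disable warning\n");
	}
}

int main(void)
{
	alloc_failed(64, 8, true, true, "made-up atomic failure reason");
	alloc_failed(64, 8, false, true, "made-up non-atomic failure reason");
	return 0;
}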
From c439d5e8a0deb7310b5bb4e5f2fe47c40ff5297f Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Wed, 23 Aug 2023 07:06:08 +0200
Subject: [PATCH 5/6] pcpcntr: add group allocation/free

Allocations and frees are globally serialized on the pcpu lock (and the
CPU hotplug lock if enabled, which is the case on Debian).

At least one frequent consumer allocates 4 back-to-back counters (and
frees them in the same manner), exacerbating the problem.

While this does not fully remedy scalability issues, it is a step
towards that goal and provides immediate relief.

Signed-off-by: Mateusz Guzik
Reviewed-by: Dennis Zhou
Reviewed-by: Vegard Nossum
Link: https://lore.kernel.org/r/20230823050609.2228718-2-mjguzik@gmail.com
[Dennis: reflowed a few lines]
Signed-off-by: Dennis Zhou
---
 include/linux/percpu_counter.h | 41 +++++++++++++++++++----
 lib/percpu_counter.c           | 60 ++++++++++++++++++++++++----------
 2 files changed, 76 insertions(+), 25 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 75b73c83bc9d..d01351b1526f 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -30,17 +30,28 @@ struct percpu_counter {
 
 extern int percpu_counter_batch;
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
-			  struct lock_class_key *key);
+int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
+			       gfp_t gfp, u32 nr_counters,
+			       struct lock_class_key *key);
 
-#define percpu_counter_init(fbc, value, gfp)				\
+#define percpu_counter_init_many(fbc, value, gfp, nr_counters)		\
 	({								\
 		static struct lock_class_key __key;			\
 									\
-		__percpu_counter_init(fbc, value, gfp, &__key);		\
+		__percpu_counter_init_many(fbc, value, gfp, nr_counters,\
+					   &__key);			\
 	})
 
-void percpu_counter_destroy(struct percpu_counter *fbc);
+#define percpu_counter_init(fbc, value, gfp)				\
+	percpu_counter_init_many(fbc, value, gfp, 1)
+
+void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters);
+static inline void percpu_counter_destroy(struct percpu_counter *fbc)
+{
+	percpu_counter_destroy_many(fbc, 1);
+}
+
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
 			      s32 batch);
@@ -116,11 +127,27 @@ struct percpu_counter {
 	s64 count;
 };
 
+static inline int percpu_counter_init_many(struct percpu_counter *fbc,
+					   s64 amount, gfp_t gfp,
+					   u32 nr_counters)
+{
+	u32 i;
+
+	for (i = 0; i < nr_counters; i++)
+		fbc[i].count = amount;
+
+	return 0;
+}
+
 static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
 				      gfp_t gfp)
 {
-	fbc->count = amount;
-	return 0;
+	return percpu_counter_init_many(fbc, amount, gfp, 1);
+}
+
+static inline void percpu_counter_destroy_many(struct percpu_counter *fbc,
+					       u32 nr_counters)
+{
 }
 
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 5004463c4f9f..9073430dc865 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -151,48 +151,72 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 }
 EXPORT_SYMBOL(__percpu_counter_sum);
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
-			  struct lock_class_key *key)
+int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
+			       gfp_t gfp, u32 nr_counters,
+			       struct lock_class_key *key)
 {
 	unsigned long flags __maybe_unused;
+	size_t counter_size;
+	s32 __percpu *counters;
+	u32 i;
 
-	raw_spin_lock_init(&fbc->lock);
-	lockdep_set_class(&fbc->lock, key);
-	fbc->count = amount;
-	fbc->counters = alloc_percpu_gfp(s32, gfp);
-	if (!fbc->counters)
+	counter_size = ALIGN(sizeof(*counters), __alignof__(*counters));
+	counters = __alloc_percpu_gfp(nr_counters * counter_size,
+				      __alignof__(*counters), gfp);
+	if (!counters) {
+		fbc[0].counters = NULL;
 		return -ENOMEM;
+	}
 
-	debug_percpu_counter_activate(fbc);
+	for (i = 0; i < nr_counters; i++) {
+		raw_spin_lock_init(&fbc[i].lock);
+		lockdep_set_class(&fbc[i].lock, key);
+#ifdef CONFIG_HOTPLUG_CPU
+		INIT_LIST_HEAD(&fbc[i].list);
+#endif
+		fbc[i].count = amount;
+		fbc[i].counters = (void *)counters + (i * counter_size);
+
+		debug_percpu_counter_activate(&fbc[i]);
+	}
 
 #ifdef CONFIG_HOTPLUG_CPU
-	INIT_LIST_HEAD(&fbc->list);
 	spin_lock_irqsave(&percpu_counters_lock, flags);
-	list_add(&fbc->list, &percpu_counters);
+	for (i = 0; i < nr_counters; i++)
+		list_add(&fbc[i].list, &percpu_counters);
 	spin_unlock_irqrestore(&percpu_counters_lock, flags);
 #endif
 	return 0;
 }
-EXPORT_SYMBOL(__percpu_counter_init);
+EXPORT_SYMBOL(__percpu_counter_init_many);
 
-void percpu_counter_destroy(struct percpu_counter *fbc)
+void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters)
 {
 	unsigned long flags __maybe_unused;
+	u32 i;
 
-	if (!fbc->counters)
+	if (WARN_ON_ONCE(!fbc))
 		return;
 
-	debug_percpu_counter_deactivate(fbc);
+	if (!fbc[0].counters)
+		return;
+
+	for (i = 0; i < nr_counters; i++)
+		debug_percpu_counter_deactivate(&fbc[i]);
 
 #ifdef CONFIG_HOTPLUG_CPU
 	spin_lock_irqsave(&percpu_counters_lock, flags);
-	list_del(&fbc->list);
+	for (i = 0; i < nr_counters; i++)
+		list_del(&fbc[i].list);
 	spin_unlock_irqrestore(&percpu_counters_lock, flags);
 #endif
-	free_percpu(fbc->counters);
-	fbc->counters = NULL;
+
+	free_percpu(fbc[0].counters);
+
+	for (i = 0; i < nr_counters; i++)
+		fbc[i].counters = NULL;
 }
-EXPORT_SYMBOL(percpu_counter_destroy);
+EXPORT_SYMBOL(percpu_counter_destroy_many);
 
 int percpu_counter_batch __read_mostly = 32;
 EXPORT_SYMBOL(percpu_counter_batch);
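A sketch of how a caller might use the batch API introduced above. The
foo_stats struct and functions are made up for illustration, but the
percpu_counter_init_many()/percpu_counter_destroy_many() calls match the
interface this patch adds; all four counters share a single percpu
allocation, carved up at counter_size offsets:

#include <linux/gfp.h>
#include <linux/percpu_counter.h>

struct foo_stats {			/* hypothetical consumer */
	struct percpu_counter events[4];
};

static int foo_stats_init(struct foo_stats *fs)
{
	/* one __alloc_percpu_gfp() call backs all four counters */
	return percpu_counter_init_many(fs->events, 0, GFP_KERNEL, 4);
}

static void foo_stats_exit(struct foo_stats *fs)
{
	/* one free_percpu() call, one pass under percpu_counters_lock */
	percpu_counter_destroy_many(fs->events, 4);
}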
From 14ef95be6f5558fb9e43aaf06ef9a1d6e0cae6c8 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Wed, 23 Aug 2023 07:06:09 +0200
Subject: [PATCH 6/6] kernel/fork: group allocation/free of per-cpu counters for mm struct

A trivial execve scalability test which tries to be very friendly
(statically linked binaries, all separate) is predominantly bottlenecked
by back-to-back per-cpu counter allocations which serialize on global
locks.

Ease the pain by allocating and freeing them in one go.

Bench can be found here:

http://apollo.backplane.com/DFlyMisc/doexec.c

$ cc -static -O2 -o static-doexec doexec.c
$ ./static-doexec $(nproc)

Even at a very modest scale of 26 cores (ops/s):

before:	133543.63
after:	186061.81 (+39%)

While with the patch these allocations remain a significant problem,
the primary bottleneck shifts to page release handling.

Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20230823050609.2228718-3-mjguzik@gmail.com
[Dennis: reflowed 1 line]
Signed-off-by: Dennis Zhou
---
 kernel/fork.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index d2e12b6d2b18..afd198bae640 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
  */
 void __mmdrop(struct mm_struct *mm)
 {
-	int i;
-
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
 
@@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
 	put_user_ns(mm->user_ns);
 	mm_pasid_drop(mm);
 	mm_destroy_cid(mm);
+	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		percpu_counter_destroy(&mm->rss_stat[i]);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1252,8 +1249,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	struct user_namespace *user_ns)
 {
-	int i;
-
 	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
 	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
 	atomic_set(&mm->mm_users, 1);
@@ -1301,17 +1296,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm))
 		goto fail_cid;
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
-			goto fail_pcpu;
+	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+				     NR_MM_COUNTERS))
+		goto fail_pcpu;
 
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
 	return mm;
 
 fail_pcpu:
-	while (i > 0)
-		percpu_counter_destroy(&mm->rss_stat[--i]);
 	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
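For reference, a rough guess at the shape of a doexec-style benchmark loop:
each iteration forks and execs a separate binary, so mm_init() and
__mmdrop() run once per iteration and exercise the counter paths patched
above. This is only a sketch of the workload, not the real benchmark; see
the URL in the commit message for that:

#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* any binary works for the loop shape; a static one avoids
	 * measuring the dynamic loader */
	char *child_argv[] = { "/bin/true", NULL };
	long iters = argc > 1 ? atol(argv[1]) : 100000;

	for (long i = 0; i < iters; i++) {
		pid_t pid = fork();

		if (pid == 0) {
			execv(child_argv[0], child_argv);
			_exit(127);	/* exec failed */
		}
		if (pid > 0)
			waitpid(pid, NULL, 0);
	}
	return 0;
}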