linux-stable/include/linux/nmi.h
Li Huafei d6834d9c99 watchdog/hardlockup/perf: Fix perf_event memory leak
During stress-testing, we found a kmemleak report for perf_event:

  unreferenced object 0xff110001410a33e0 (size 1328):
    comm "kworker/4:11", pid 288, jiffies 4294916004
    hex dump (first 32 bytes):
      b8 be c2 3b 02 00 11 ff 22 01 00 00 00 00 ad de  ...;....".......
      f0 33 0a 41 01 00 11 ff f0 33 0a 41 01 00 11 ff  .3.A.....3.A....
    backtrace (crc 24eb7b3a):
      [<00000000e211b653>] kmem_cache_alloc_node_noprof+0x269/0x2e0
      [<000000009d0985fa>] perf_event_alloc+0x5f/0xcf0
      [<00000000084ad4a2>] perf_event_create_kernel_counter+0x38/0x1b0
      [<00000000fde96401>] hardlockup_detector_event_create+0x50/0xe0
      [<0000000051183158>] watchdog_hardlockup_enable+0x17/0x70
      [<00000000ac89727f>] softlockup_start_fn+0x15/0x40
      ...

Our stress test includes CPU online and offline cycles, and updating the
watchdog configuration.

After reading the code, I found that there may be a race between cleaning up
perf_event after updating watchdog and disabling event when the CPU goes offline:

  CPU0                          CPU1                           CPU2
  (update watchdog)                                            (hotplug offline CPU1)

  ...                                                          _cpu_down(CPU1)
  cpus_read_lock()                                             // waiting for cpu lock
    softlockup_start_all
      smp_call_on_cpu(CPU1)
                                softlockup_start_fn
                                ...
                                  watchdog_hardlockup_enable(CPU1)
                                    perf create E1
                                    watchdog_ev[CPU1] = E1
  cpus_read_unlock()
                                                               cpus_write_lock()
                                                               cpuhp_kick_ap_work(CPU1)
                                cpuhp_thread_fun
                                ...
                                  watchdog_hardlockup_disable(CPU1)
                                    watchdog_ev[CPU1] = NULL
                                    dead_event[CPU1] = E1
  __lockup_detector_cleanup
    for each dead_events_mask
      release each dead_event
      /*
       * CPU1 has not been added to
       * dead_events_mask, then E1
       * will not be released
       */
                                    CPU1 -> dead_events_mask
    cpumask_clear(&dead_events_mask)
    // dead_events_mask is cleared, E1 is leaked

In this case, the leaked perf_event E1 matches the perf_event leak
reported by kmemleak. Due to the low probability of problem recurrence
(only reported once), I added some hack delays in the code:

  static void __lockup_detector_reconfigure(void)
  {
    ...
          watchdog_hardlockup_start();
          cpus_read_unlock();
  +       mdelay(100);
          /*
           * Must be called outside the cpus locked section to prevent
           * recursive locking in the perf code.
    ...
  }

  void watchdog_hardlockup_disable(unsigned int cpu)
  {
    ...
                  perf_event_disable(event);
                  this_cpu_write(watchdog_ev, NULL);
                  this_cpu_write(dead_event, event);
  +               mdelay(100);
                  cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
                  atomic_dec(&watchdog_cpus);
    ...
  }

  void hardlockup_detector_perf_cleanup(void)
  {
    ...
                          perf_event_release_kernel(event);
                  per_cpu(dead_event, cpu) = NULL;
          }
  +       mdelay(100);
          cpumask_clear(&dead_events_mask);
  }

Then, simultaneously performing CPU on/off and switching watchdog, it is
almost certain to reproduce this leak.

The problem here is that releasing perf_event is not within the CPU
hotplug read-write lock. Commit:

  941154bd69 ("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock")

introduced deferred release to solve the deadlock caused by calling
get_online_cpus() when releasing perf_event. Later, commit:

  efe951d3de ("perf/x86: Fix perf,x86,cpuhp deadlock")

removed the get_online_cpus() call on the perf_event release path to solve
another deadlock problem.

Therefore, it is now possible to move the release of perf_event back
into the CPU hotplug read-write lock, and release the event immediately
after disabling it.

Fixes: 941154bd69 ("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock")
Signed-off-by: Li Huafei <lihuafei1@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241021193004.308303-1-lihuafei1@huawei.com
2025-03-06 12:05:33 +01:00

230 lines
7.2 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/include/linux/nmi.h
*/
#ifndef LINUX_NMI_H
#define LINUX_NMI_H
#include <linux/sched.h>
#include <asm/irq.h>
/* Arch specific watchdogs might need to share extra watchdog-related APIs. */
#if defined(CONFIG_HARDLOCKUP_DETECTOR_ARCH) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64)
#include <asm/nmi.h>
#endif
#ifdef CONFIG_LOCKUP_DETECTOR
void lockup_detector_init(void);
void lockup_detector_retry_init(void);
void lockup_detector_soft_poweroff(void);
extern int watchdog_user_enabled;
extern int watchdog_thresh;
extern unsigned long watchdog_enabled;
extern struct cpumask watchdog_cpumask;
extern unsigned long *watchdog_cpumask_bits;
#ifdef CONFIG_SMP
extern int sysctl_softlockup_all_cpu_backtrace;
extern int sysctl_hardlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#define sysctl_hardlockup_all_cpu_backtrace 0
#endif /* !CONFIG_SMP */
#else /* CONFIG_LOCKUP_DETECTOR */
static inline void lockup_detector_init(void) { }
static inline void lockup_detector_retry_init(void) { }
static inline void lockup_detector_soft_poweroff(void) { }
#endif /* !CONFIG_LOCKUP_DETECTOR */
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
extern void touch_softlockup_watchdog_sched(void);
extern void touch_softlockup_watchdog(void);
extern void touch_softlockup_watchdog_sync(void);
extern void touch_all_softlockup_watchdogs(void);
extern unsigned int softlockup_panic;
extern int lockup_detector_online_cpu(unsigned int cpu);
extern int lockup_detector_offline_cpu(unsigned int cpu);
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
static inline void touch_softlockup_watchdog_sched(void) { }
static inline void touch_softlockup_watchdog(void) { }
static inline void touch_softlockup_watchdog_sync(void) { }
static inline void touch_all_softlockup_watchdogs(void) { }
#define lockup_detector_online_cpu NULL
#define lockup_detector_offline_cpu NULL
#endif /* CONFIG_SOFTLOCKUP_DETECTOR */
#ifdef CONFIG_DETECT_HUNG_TASK
void reset_hung_task_detector(void);
#else
static inline void reset_hung_task_detector(void) { }
#endif
/*
* The run state of the lockup detectors is controlled by the content of the
* 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
* bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
*
* 'watchdog_user_enabled', 'watchdog_hardlockup_user_enabled' and
* 'watchdog_softlockup_user_enabled' are variables that are only used as an
* 'interface' between the parameters in /proc/sys/kernel and the internal
* state bits in 'watchdog_enabled'. The 'watchdog_thresh' variable is
* handled differently because its value is not boolean, and the lockup
* detectors are 'suspended' while 'watchdog_thresh' is equal zero.
*/
#define WATCHDOG_HARDLOCKUP_ENABLED_BIT 0
#define WATCHDOG_SOFTOCKUP_ENABLED_BIT 1
#define WATCHDOG_HARDLOCKUP_ENABLED (1 << WATCHDOG_HARDLOCKUP_ENABLED_BIT)
#define WATCHDOG_SOFTOCKUP_ENABLED (1 << WATCHDOG_SOFTOCKUP_ENABLED_BIT)
#if defined(CONFIG_HARDLOCKUP_DETECTOR)
extern void hardlockup_detector_disable(void);
extern unsigned int hardlockup_panic;
#else
static inline void hardlockup_detector_disable(void) {}
#endif
/* Sparc64 has special implemetantion that is always enabled. */
#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64)
void arch_touch_nmi_watchdog(void);
#else
static inline void arch_touch_nmi_watchdog(void) { }
#endif
#if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER)
void watchdog_hardlockup_touch_cpu(unsigned int cpu);
void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs);
#endif
#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
extern void hardlockup_detector_perf_stop(void);
extern void hardlockup_detector_perf_restart(void);
extern void hardlockup_config_perf_event(const char *str);
#else
static inline void hardlockup_detector_perf_stop(void) { }
static inline void hardlockup_detector_perf_restart(void) { }
static inline void hardlockup_config_perf_event(const char *str) { }
#endif
void watchdog_hardlockup_stop(void);
void watchdog_hardlockup_start(void);
int watchdog_hardlockup_probe(void);
void watchdog_hardlockup_enable(unsigned int cpu);
void watchdog_hardlockup_disable(unsigned int cpu);
void lockup_detector_reconfigure(void);
#ifdef CONFIG_HARDLOCKUP_DETECTOR_BUDDY
void watchdog_buddy_check_hardlockup(int hrtimer_interrupts);
#else
static inline void watchdog_buddy_check_hardlockup(int hrtimer_interrupts) {}
#endif
/**
* touch_nmi_watchdog - manually reset the hardlockup watchdog timeout.
*
* If we support detecting hardlockups, touch_nmi_watchdog() may be
* used to pet the watchdog (reset the timeout) - for code which
* intentionally disables interrupts for a long time. This call is stateless.
*
* Though this function has "nmi" in the name, the hardlockup watchdog might
* not be backed by NMIs. This function will likely be renamed to
* touch_hardlockup_watchdog() in the future.
*/
static inline void touch_nmi_watchdog(void)
{
/*
* Pass on to the hardlockup detector selected via CONFIG_. Note that
* the hardlockup detector may not be arch-specific nor using NMIs
* and the arch_touch_nmi_watchdog() function will likely be renamed
* in the future.
*/
arch_touch_nmi_watchdog();
touch_softlockup_watchdog();
}
/*
* Create trigger_all_cpu_backtrace() out of the arch-provided
* base function. Return whether such support was available,
* to allow calling code to fall back to some other mechanism:
*/
#ifdef arch_trigger_cpumask_backtrace
static inline bool trigger_all_cpu_backtrace(void)
{
arch_trigger_cpumask_backtrace(cpu_online_mask, -1);
return true;
}
static inline bool trigger_allbutcpu_cpu_backtrace(int exclude_cpu)
{
arch_trigger_cpumask_backtrace(cpu_online_mask, exclude_cpu);
return true;
}
static inline bool trigger_cpumask_backtrace(struct cpumask *mask)
{
arch_trigger_cpumask_backtrace(mask, -1);
return true;
}
static inline bool trigger_single_cpu_backtrace(int cpu)
{
arch_trigger_cpumask_backtrace(cpumask_of(cpu), -1);
return true;
}
/* generic implementation */
void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
int exclude_cpu,
void (*raise)(cpumask_t *mask));
bool nmi_cpu_backtrace(struct pt_regs *regs);
#else
static inline bool trigger_all_cpu_backtrace(void)
{
return false;
}
static inline bool trigger_allbutcpu_cpu_backtrace(int exclude_cpu)
{
return false;
}
static inline bool trigger_cpumask_backtrace(struct cpumask *mask)
{
return false;
}
static inline bool trigger_single_cpu_backtrace(int cpu)
{
return false;
}
#endif
#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
u64 hw_nmi_get_sample_period(int watchdog_thresh);
bool arch_perf_nmi_is_available(void);
#endif
#if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \
defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
void watchdog_update_hrtimer_threshold(u64 period);
#else
static inline void watchdog_update_hrtimer_threshold(u64 period) { }
#endif
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
#include <asm/nmi.h>
#endif
#ifdef CONFIG_NMI_CHECK_CPU
void nmi_backtrace_stall_snap(const struct cpumask *btp);
void nmi_backtrace_stall_check(const struct cpumask *btp);
#else
static inline void nmi_backtrace_stall_snap(const struct cpumask *btp) {}
static inline void nmi_backtrace_stall_check(const struct cpumask *btp) {}
#endif
#endif