hardlockup: detect hard lockups without NMIs using secondary cpus
Emulate NMIs on systems where they are not available by using timer interrupts on other CPUs. Each CPU uses its softlockup hrtimer to check that the next CPU is processing hrtimer interrupts, by verifying that a counter is increasing.

This patch is useful on systems where the hardlockup detector is unavailable due to a lack of NMIs, for example most ARM SoCs. Without this patch, any CPU stuck with interrupts disabled can cause a hardware watchdog reset with no debugging information; with it, the kernel can detect the lockup and panic, which can produce useful debugging info.

Change-Id: Ia5faf50243e19c1755201212e04c8892d929785a
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Huibin Hong <huibin.hong@rock-chips.com>
Signed-off-by: Tao Huang <huangtao@rock-chips.com>
This commit is contained in:
@ -94,6 +94,45 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
|
||||
# endif /* CONFIG_SMP */
|
||||
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
|
||||
|
||||
#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
/* Mask of CPUs currently participating in the cross-CPU lockup check ring. */
static cpumask_t __read_mostly watchdog_cpus;
/* Per-CPU: a hard-lockup warning has already been issued for this CPU. */
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
/* Per-CPU: skip the next check of this CPU (set around online/offline). */
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static unsigned int watchdog_next_cpu(unsigned int cpu);
||||
int watchdog_nmi_enable(unsigned int cpu)
|
||||
{
|
||||
/*
|
||||
* The new cpu will be marked online before the first hrtimer interrupt
|
||||
* runs on it. If another cpu tests for a hardlockup on the new cpu
|
||||
* before it has run its first hrtimer, it will get a false positive.
|
||||
* Touch the watchdog on the new cpu to delay the first check for at
|
||||
* least 3 sampling periods to guarantee one hrtimer has run on the new
|
||||
* cpu.
|
||||
*/
|
||||
per_cpu(watchdog_nmi_touch, cpu) = true;
|
||||
smp_wmb();
|
||||
cpumask_set_cpu(cpu, &watchdog_cpus);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void watchdog_nmi_disable(unsigned int cpu)
|
||||
{
|
||||
unsigned int next_cpu = watchdog_next_cpu(cpu);
|
||||
|
||||
/*
|
||||
* Offlining this cpu will cause the cpu before this one to start
|
||||
* checking the one after this one. If this cpu just finished checking
|
||||
* the next cpu and updating hrtimer_interrupts_saved, and then the
|
||||
* previous cpu checks it within one sample period, it will trigger a
|
||||
* false positive. Touch the watchdog on the next cpu to prevent it.
|
||||
*/
|
||||
if (next_cpu < nr_cpu_ids)
|
||||
per_cpu(watchdog_nmi_touch, next_cpu) = true;
|
||||
smp_wmb();
|
||||
cpumask_clear_cpu(cpu, &watchdog_cpus);
|
||||
}
|
||||
#else
|
||||
/*
|
||||
* These functions can be overridden if an architecture implements its
|
||||
* own hardlockup detector.
|
||||
@ -112,6 +151,7 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
|
||||
{
|
||||
hardlockup_detector_perf_disable();
|
||||
}
|
||||
#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */
|
||||
|
||||
/* Return 0, if a NMI watchdog is available. Error code otherwise */
|
||||
int __weak __init watchdog_nmi_probe(void)
|
||||
@ -327,6 +367,76 @@ bool is_hardlockup(void)
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
|
||||
static unsigned int watchdog_next_cpu(unsigned int cpu)
|
||||
{
|
||||
cpumask_t cpus = watchdog_cpus;
|
||||
unsigned int next_cpu;
|
||||
|
||||
next_cpu = cpumask_next(cpu, &cpus);
|
||||
if (next_cpu >= nr_cpu_ids)
|
||||
next_cpu = cpumask_first(&cpus);
|
||||
|
||||
if (next_cpu == cpu)
|
||||
return nr_cpu_ids;
|
||||
|
||||
return next_cpu;
|
||||
}
|
||||
|
||||
static int is_hardlockup_other_cpu(unsigned int cpu)
|
||||
{
|
||||
unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
|
||||
|
||||
if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
|
||||
return 1;
|
||||
|
||||
per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void watchdog_check_hardlockup_other_cpu(void)
|
||||
{
|
||||
unsigned int next_cpu;
|
||||
|
||||
/*
|
||||
* Test for hardlockups every 3 samples. The sample period is
|
||||
* watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
|
||||
* watchdog_thresh (over by 20%).
|
||||
*/
|
||||
if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
|
||||
return;
|
||||
|
||||
/* check for a hardlockup on the next cpu */
|
||||
next_cpu = watchdog_next_cpu(smp_processor_id());
|
||||
if (next_cpu >= nr_cpu_ids)
|
||||
return;
|
||||
|
||||
smp_rmb();
|
||||
|
||||
if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
|
||||
per_cpu(watchdog_nmi_touch, next_cpu) = false;
|
||||
return;
|
||||
}
|
||||
|
||||
if (is_hardlockup_other_cpu(next_cpu)) {
|
||||
/* only warn once */
|
||||
if (per_cpu(hard_watchdog_warn, next_cpu) == true)
|
||||
return;
|
||||
|
||||
if (hardlockup_panic)
|
||||
panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
|
||||
else
|
||||
WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);
|
||||
|
||||
per_cpu(hard_watchdog_warn, next_cpu) = true;
|
||||
} else {
|
||||
per_cpu(hard_watchdog_warn, next_cpu) = false;
|
||||
}
|
||||
}
|
||||
#else
/* No cross-CPU detector configured: the check is a no-op. */
static inline void watchdog_check_hardlockup_other_cpu(void) { }
#endif
|
||||
|
||||
static void watchdog_interrupt_count(void)
|
||||
{
|
||||
__this_cpu_inc(hrtimer_interrupts);
|
||||
@ -364,6 +474,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
|
||||
if (!watchdog_enabled)
|
||||
return HRTIMER_NORESTART;
|
||||
|
||||
/* test for hardlockups on the next cpu */
|
||||
if (IS_ENABLED(CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU))
|
||||
watchdog_check_hardlockup_other_cpu();
|
||||
|
||||
/* kick the hardlockup detector */
|
||||
watchdog_interrupt_count();
|
||||
|
||||
@ -495,6 +609,8 @@ static void watchdog_enable(unsigned int cpu)
|
||||
/* Enable the perf event */
|
||||
if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
|
||||
watchdog_nmi_enable(cpu);
|
||||
else if (IS_ENABLED(CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU))
|
||||
watchdog_nmi_enable(cpu);
|
||||
}
|
||||
|
||||
static void watchdog_disable(unsigned int cpu)
|
||||
|
||||
@ -858,12 +858,22 @@ config HARDLOCKUP_CHECK_TIMESTAMP
|
||||
# arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard
|
||||
# lockup detector rather than the perf based detector.
|
||||
#
|
||||
#
# The cross-CPU ("other cpu") hard lockup detector requires SMP and is a
# fallback for when neither a perf- nor an arch-based detector exists.
#
config HAVE_HARDLOCKUP_DETECTOR_OTHER_CPU
	def_bool y
	depends on SMP
	depends on !HAVE_HARDLOCKUP_DETECTOR_PERF && !HAVE_HARDLOCKUP_DETECTOR_ARCH
||||
|
||||
config HARDLOCKUP_DETECTOR_OTHER_CPU
	bool
	# Piggybacks on the softlockup hrtimer to sample other CPUs.
	select SOFTLOCKUP_DETECTOR
|
||||
config HARDLOCKUP_DETECTOR
	bool "Detect Hard Lockups"
	depends on DEBUG_KERNEL && !S390
	# Kconfig ANDs multiple `depends on` lines, so the pre-patch line
	# (PERF || ARCH only) must be replaced — not kept alongside — or it
	# would mask the OTHER_CPU fallback this patch adds for NMI-less
	# systems (e.g. most ARM SoCs).
	depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH || HAVE_HARDLOCKUP_DETECTOR_OTHER_CPU
	select LOCKUP_DETECTOR
	select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF
	select HARDLOCKUP_DETECTOR_OTHER_CPU if HAVE_HARDLOCKUP_DETECTOR_OTHER_CPU
	help
	  Say Y here to enable the kernel to act as a watchdog to detect
	  hard lockups.
|
||||
Reference in New Issue
Block a user