summaryrefslogtreecommitdiff
authorColin Cross <ccross@android.com>2013-01-11 21:51:48 (GMT)
committer jie.yuan <jie.yuan@amlogic.com>2018-03-02 06:18:04 (GMT)
commit3528de9636a4ba772277845a561deea3a69fc87c (patch)
treee03b8727fa9a7fc5ec7a1f94f356240e22cd72e6
parent6388b25b51ef64e157bf9b2c3b59ee1750a5554e (diff)
downloadcommon-3528de9636a4ba772277845a561deea3a69fc87c.zip
common-3528de9636a4ba772277845a561deea3a69fc87c.tar.gz
common-3528de9636a4ba772277845a561deea3a69fc87c.tar.bz2
hardlockup: detect hard lockups without NMIs using secondary cpus
PD#160530: porting hard lockup without nmi support Emulate NMIs on systems where they are not available by using timer interrupts on other cpus. Each cpu will use its softlockup hrtimer to check that the next cpu is processing hrtimer interrupts by verifying that a counter is increasing. This patch is useful on systems where the hardlockup detector is not available due to a lack of NMIs, for example most ARM SoCs. Without this patch any cpu stuck with interrupts disabled can cause a hardware watchdog reset with no debugging information, but with this patch the kernel can detect the lockup and panic, which can result in useful debugging info. Change-Id: Ia42886e6f29de53e999e9ce10e6e01fd063c0ffa Signed-off-by: Colin Cross <ccross@android.com> Signed-off-by: Jiamin Ma <jiamin.ma@amlogic.com>
Diffstat
-rw-r--r--arch/arm64/configs/meson64_defconfig1
-rw-r--r--include/linux/nmi.h9
-rw-r--r--kernel/watchdog.c21
-rw-r--r--kernel/watchdog_hld.c128
-rw-r--r--lib/Kconfig.debug14
5 files changed, 156 insertions, 17 deletions
diff --git a/arch/arm64/configs/meson64_defconfig b/arch/arm64/configs/meson64_defconfig
index a2171ba..1395f8b 100644
--- a/arch/arm64/configs/meson64_defconfig
+++ b/arch/arm64/configs/meson64_defconfig
@@ -528,6 +528,7 @@ CONFIG_NLS_ISO8859_1=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
CONFIG_MAGIC_SYSRQ=y
+CONFIG_LOCKUP_DETECTOR=y
CONFIG_WQ_WATCHDOG=y
CONFIG_PANIC_ON_RT_THROTTLING=y
CONFIG_SCHEDSTATS=y
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 0a3fadc..32ba395 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -31,8 +31,11 @@
* may be used to reset the timeout - for code which intentionally
* disables interrupts for a long time. This call is stateless.
*/
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR_NMI)
#include <asm/nmi.h>
+#endif
+
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
extern void touch_nmi_watchdog(void);
#else
static inline void touch_nmi_watchdog(void)
@@ -143,6 +146,10 @@ static inline void lockup_detector_resume(void)
}
#endif
+extern void watchdog_check_hardlockup_other_cpu(void);
+extern int watchdog_nmi_enable(unsigned int cpu);
+extern void watchdog_nmi_disable(unsigned int cpu);
+
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
#include <asm/nmi.h>
#endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 63177be..0b3638c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -78,10 +78,10 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
unsigned int __read_mostly softlockup_panic =
@@ -211,6 +211,7 @@ void touch_softlockup_watchdog_sync(void)
}
/* watchdog detector functions */
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
bool is_hardlockup(void)
{
unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
@@ -221,6 +222,7 @@ bool is_hardlockup(void)
__this_cpu_write(hrtimer_interrupts_saved, hrint);
return false;
}
+#endif
static int is_softlockup(unsigned long touch_ts)
{
@@ -239,18 +241,6 @@ static void watchdog_interrupt_count(void)
__this_cpu_inc(hrtimer_interrupts);
}
-/*
- * These two functions are mostly architecture specific
- * defining them as weak here.
- */
-int __weak watchdog_nmi_enable(unsigned int cpu)
-{
- return 0;
-}
-void __weak watchdog_nmi_disable(unsigned int cpu)
-{
-}
-
static int watchdog_enable_all_cpus(void);
static void watchdog_disable_all_cpus(void);
@@ -268,6 +258,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* kick the hardlockup detector */
watchdog_interrupt_count();
+ /* test for hardlockups on the next cpu */
+ watchdog_check_hardlockup_other_cpu();
+
/* kick the softlockup detector */
wake_up_process(__this_cpu_read(softlockup_watchdog));
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 12b8dd6..ac528e1 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -18,7 +18,14 @@
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static cpumask_t __read_mostly watchdog_cpus;
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+#endif
+DECLARE_PER_CPU(unsigned long, hrtimer_interrupts);
+DECLARE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
/* boot commands */
/*
@@ -26,7 +33,7 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
*/
unsigned int __read_mostly hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
-static unsigned long hardlockup_allcpu_dumped;
+static unsigned long __maybe_unused hardlockup_allcpu_dumped;
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -68,6 +75,7 @@ void touch_nmi_watchdog(void)
}
EXPORT_SYMBOL(touch_nmi_watchdog);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
@@ -131,7 +139,84 @@ static void watchdog_overflow_callback(struct perf_event *event,
__this_cpu_write(hard_watchdog_warn, false);
return;
}
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+ cpumask_t cpus = watchdog_cpus;
+ unsigned int next_cpu;
+
+ next_cpu = cpumask_next(cpu, &cpus);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(&cpus);
+
+ if (next_cpu == cpu)
+ return nr_cpu_ids;
+
+ return next_cpu;
+}
+
+static int is_hardlockup_other_cpu(unsigned int cpu)
+{
+ unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+ if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+ return 1;
+
+ per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+ return 0;
+}
+
+void watchdog_check_hardlockup_other_cpu(void)
+{
+ unsigned int next_cpu;
+
+ /*
+ * Test for hardlockups every 3 samples. The sample period is
+ * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+ * watchdog_thresh (over by 20%).
+ */
+ if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
+ return;
+
+ /* check for a hardlockup on the next cpu */
+ next_cpu = watchdog_next_cpu(smp_processor_id());
+ if (next_cpu >= nr_cpu_ids)
+ return;
+
+ /*mem barrier*/
+ smp_rmb();
+
+ if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
+ per_cpu(watchdog_nmi_touch, next_cpu) = false;
+ return;
+ }
+
+ if (is_hardlockup_other_cpu(next_cpu)) {
+ /* only warn once */
+ if (per_cpu(hard_watchdog_warn, next_cpu) == true)
+ return;
+
+ if (hardlockup_panic)
+ panic("Watchdog detected hard LOCKUP on cpu %u",
+ next_cpu);
+ else
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %u",
+ next_cpu);
+
+ per_cpu(hard_watchdog_warn, next_cpu) = true;
+ } else {
+ per_cpu(hard_watchdog_warn, next_cpu) = false;
+ }
+}
+#else
+static inline void watchdog_check_hardlockup_other_cpu(void) { return; }
+#endif
+
+
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
/*
* People like the simple clean cpu node info on boot.
* Reduce the watchdog noise by only printing messages
@@ -228,3 +313,44 @@ void watchdog_nmi_disable(unsigned int cpu)
cpu0_err = 0;
}
}
+#else
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+int watchdog_nmi_enable(unsigned int cpu)
+{
+ /*
+ * The new cpu will be marked online before the first hrtimer interrupt
+ * runs on it. If another cpu tests for a hardlockup on the new cpu
+ * before it has run its first hrtimer, it will get a false positive.
+ * Touch the watchdog on the new cpu to delay the first check for at
+ * least 3 sampling periods to guarantee one hrtimer has run on the new
+ * cpu.
+ */
+ per_cpu(watchdog_nmi_touch, cpu) = true;
+ /*mem barrier*/
+ smp_wmb();
+ cpumask_set_cpu(cpu, &watchdog_cpus);
+ return 0;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+ unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+ /*
+ * Offlining this cpu will cause the cpu before this one to start
+ * checking the one after this one. If this cpu just finished checking
+ * the next cpu and updating hrtimer_interrupts_saved, and then the
+ * previous cpu checks it within one sample period, it will trigger a
+ * false positive. Touch the watchdog on the next cpu to prevent it.
+ */
+ if (next_cpu < nr_cpu_ids)
+ per_cpu(watchdog_nmi_touch, next_cpu) = true;
+ /*mem barrier*/
+ smp_wmb();
+ cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+#else
+int watchdog_nmi_enable(unsigned int cpu) { return 0; }
+void watchdog_nmi_disable(unsigned int cpu) { return; }
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index db71a20..d8a4358 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -764,15 +764,27 @@ config LOCKUP_DETECTOR
The overhead should be minimal. A periodic hrtimer runs to
generate interrupts and kick the watchdog task every 4 seconds.
An NMI is generated every 10 seconds or so to check for hardlockups.
+ If NMIs are not available on the platform, every 12 seconds the
+ hrtimer interrupt on one cpu will be used to check for hardlockups
+ on the next cpu.
The frequency of hrtimer and NMI events and the soft and hard lockup
thresholds can be controlled through the sysctl watchdog_thresh.
-config HARDLOCKUP_DETECTOR
+config HARDLOCKUP_DETECTOR_NMI
def_bool y
depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
+config HARDLOCKUP_DETECTOR_OTHER_CPU
+ def_bool y
+ depends on LOCKUP_DETECTOR && SMP
+ depends on !HARDLOCKUP_DETECTOR_NMI && !HAVE_NMI_WATCHDOG
+
+config HARDLOCKUP_DETECTOR
+ def_bool y
+ depends on HARDLOCKUP_DETECTOR_NMI || HARDLOCKUP_DETECTOR_OTHER_CPU
+
config BOOTPARAM_HARDLOCKUP_PANIC
bool "Panic (Reboot) On Hard Lockups"
depends on HARDLOCKUP_DETECTOR