aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorColin Cross2013-01-11 15:51:48 -0600
committerArve Hjønnevåg2013-02-19 19:56:07 -0600
commit44d0cb81d297c96fecdc71392133c33908feb45d (patch)
tree08cdbda198ca24c439a1af02eefaf237ff557aa7
parent69e99ffaf725e2a9acf6da3cfbf1605f49a9171f (diff)
downloadkernel-common-44d0cb81d297c96fecdc71392133c33908feb45d.tar.gz
kernel-common-44d0cb81d297c96fecdc71392133c33908feb45d.tar.xz
kernel-common-44d0cb81d297c96fecdc71392133c33908feb45d.zip
hardlockup: detect hard lockups without NMIs using secondary cpus
Emulate NMIs on systems where they are not available by using timer interrupts on other cpus. Each cpu will use its softlockup hrtimer to check that the next cpu is processing hrtimer interrupts by verifying that a counter is increasing.

This patch is useful on systems where the hardlockup detector is not available due to a lack of NMIs, for example most ARM SoCs. Without this patch any cpu stuck with interrupts disabled can cause a hardware watchdog reset with no debugging information, but with this patch the kernel can detect the lockup and panic, which can result in useful debugging info.

Signed-off-by: Colin Cross <ccross@android.com>
-rw-r--r--include/linux/nmi.h5
-rw-r--r--kernel/watchdog.c123
-rw-r--r--lib/Kconfig.debug14
3 files changed, 135 insertions, 7 deletions
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index db50840e635..c8f8aa0383e 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -14,8 +14,11 @@
14 * may be used to reset the timeout - for code which intentionally 14 * may be used to reset the timeout - for code which intentionally
15 * disables interrupts for a long time. This call is stateless. 15 * disables interrupts for a long time. This call is stateless.
16 */ 16 */
17#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) 17#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR_NMI)
18#include <asm/nmi.h> 18#include <asm/nmi.h>
19#endif
20
21#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
19extern void touch_nmi_watchdog(void); 22extern void touch_nmi_watchdog(void);
20#else 23#else
21static inline void touch_nmi_watchdog(void) 24static inline void touch_nmi_watchdog(void)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 75a2ab3d0b0..61a0595464d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -44,6 +44,11 @@ static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
44static DEFINE_PER_CPU(bool, hard_watchdog_warn); 44static DEFINE_PER_CPU(bool, hard_watchdog_warn);
45static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 45static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
46static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 46static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
47#endif
48#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
49static cpumask_t __read_mostly watchdog_cpus;
50#endif
51#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
47static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 52static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
48#endif 53#endif
49 54
@@ -179,7 +184,7 @@ void touch_softlockup_watchdog_sync(void)
179 __raw_get_cpu_var(watchdog_touch_ts) = 0; 184 __raw_get_cpu_var(watchdog_touch_ts) = 0;
180} 185}
181 186
182#ifdef CONFIG_HARDLOCKUP_DETECTOR 187#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
183/* watchdog detector functions */ 188/* watchdog detector functions */
184static int is_hardlockup(void) 189static int is_hardlockup(void)
185{ 190{
@@ -193,6 +198,76 @@ static int is_hardlockup(void)
193} 198}
194#endif 199#endif
195 200
201#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
202static unsigned int watchdog_next_cpu(unsigned int cpu)
203{
204 cpumask_t cpus = watchdog_cpus;
205 unsigned int next_cpu;
206
207 next_cpu = cpumask_next(cpu, &cpus);
208 if (next_cpu >= nr_cpu_ids)
209 next_cpu = cpumask_first(&cpus);
210
211 if (next_cpu == cpu)
212 return nr_cpu_ids;
213
214 return next_cpu;
215}
216
217static int is_hardlockup_other_cpu(unsigned int cpu)
218{
219 unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
220
221 if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
222 return 1;
223
224 per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
225 return 0;
226}
227
228static void watchdog_check_hardlockup_other_cpu(void)
229{
230 unsigned int next_cpu;
231
232 /*
233 * Test for hardlockups every 3 samples. The sample period is
234 * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
235 * watchdog_thresh (over by 20%).
236 */
237 if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
238 return;
239
240 /* check for a hardlockup on the next cpu */
241 next_cpu = watchdog_next_cpu(smp_processor_id());
242 if (next_cpu >= nr_cpu_ids)
243 return;
244
245 smp_rmb();
246
247 if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
248 per_cpu(watchdog_nmi_touch, next_cpu) = false;
249 return;
250 }
251
252 if (is_hardlockup_other_cpu(next_cpu)) {
253 /* only warn once */
254 if (per_cpu(hard_watchdog_warn, next_cpu) == true)
255 return;
256
257 if (hardlockup_panic)
258 panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
259 else
260 WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);
261
262 per_cpu(hard_watchdog_warn, next_cpu) = true;
263 } else {
264 per_cpu(hard_watchdog_warn, next_cpu) = false;
265 }
266}
267#else
268static inline void watchdog_check_hardlockup_other_cpu(void) { return; }
269#endif
270
196static int is_softlockup(unsigned long touch_ts) 271static int is_softlockup(unsigned long touch_ts)
197{ 272{
198 unsigned long now = get_timestamp(smp_processor_id()); 273 unsigned long now = get_timestamp(smp_processor_id());
@@ -204,7 +279,7 @@ static int is_softlockup(unsigned long touch_ts)
204 return 0; 279 return 0;
205} 280}
206 281
207#ifdef CONFIG_HARDLOCKUP_DETECTOR 282#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
208 283
209static struct perf_event_attr wd_hw_attr = { 284static struct perf_event_attr wd_hw_attr = {
210 .type = PERF_TYPE_HARDWARE, 285 .type = PERF_TYPE_HARDWARE,
@@ -252,7 +327,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
252 __this_cpu_write(hard_watchdog_warn, false); 327 __this_cpu_write(hard_watchdog_warn, false);
253 return; 328 return;
254} 329}
255#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 330#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
256 331
257static void watchdog_interrupt_count(void) 332static void watchdog_interrupt_count(void)
258{ 333{
@@ -272,6 +347,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
272 /* kick the hardlockup detector */ 347 /* kick the hardlockup detector */
273 watchdog_interrupt_count(); 348 watchdog_interrupt_count();
274 349
350 /* test for hardlockups on the next cpu */
351 watchdog_check_hardlockup_other_cpu();
352
275 /* kick the softlockup detector */ 353 /* kick the softlockup detector */
276 wake_up_process(__this_cpu_read(softlockup_watchdog)); 354 wake_up_process(__this_cpu_read(softlockup_watchdog));
277 355
@@ -396,7 +474,7 @@ static void watchdog(unsigned int cpu)
396 __touch_watchdog(); 474 __touch_watchdog();
397} 475}
398 476
399#ifdef CONFIG_HARDLOCKUP_DETECTOR 477#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
400/* 478/*
401 * People like the simple clean cpu node info on boot. 479 * People like the simple clean cpu node info on boot.
402 * Reduce the watchdog noise by only printing messages 480 * Reduce the watchdog noise by only printing messages
@@ -472,9 +550,44 @@ static void watchdog_nmi_disable(unsigned int cpu)
472 return; 550 return;
473} 551}
474#else 552#else
553#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
554static int watchdog_nmi_enable(unsigned int cpu)
555{
556 /*
557 * The new cpu will be marked online before the first hrtimer interrupt
558 * runs on it. If another cpu tests for a hardlockup on the new cpu
559 * before it has run its first hrtimer, it will get a false positive.
560 * Touch the watchdog on the new cpu to delay the first check for at
561 * least 3 sampling periods to guarantee one hrtimer has run on the new
562 * cpu.
563 */
564 per_cpu(watchdog_nmi_touch, cpu) = true;
565 smp_wmb();
566 cpumask_set_cpu(cpu, &watchdog_cpus);
567 return 0;
568}
569
570static void watchdog_nmi_disable(unsigned int cpu)
571{
572 unsigned int next_cpu = watchdog_next_cpu(cpu);
573
574 /*
575 * Offlining this cpu will cause the cpu before this one to start
576 * checking the one after this one. If this cpu just finished checking
577 * the next cpu and updating hrtimer_interrupts_saved, and then the
578 * previous cpu checks it within one sample period, it will trigger a
579 * false positive. Touch the watchdog on the next cpu to prevent it.
580 */
581 if (next_cpu < nr_cpu_ids)
582 per_cpu(watchdog_nmi_touch, next_cpu) = true;
583 smp_wmb();
584 cpumask_clear_cpu(cpu, &watchdog_cpus);
585}
586#else
475static int watchdog_nmi_enable(unsigned int cpu) { return 0; } 587static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
476static void watchdog_nmi_disable(unsigned int cpu) { return; } 588static void watchdog_nmi_disable(unsigned int cpu) { return; }
477#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 589#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */
590#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
478 591
479/* prepare/enable/disable routines */ 592/* prepare/enable/disable routines */
480/* sysctl functions */ 593/* sysctl functions */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index aaf8baf8635..f7c48595201 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -191,15 +191,27 @@ config LOCKUP_DETECTOR
191 The overhead should be minimal. A periodic hrtimer runs to 191 The overhead should be minimal. A periodic hrtimer runs to
192 generate interrupts and kick the watchdog task every 4 seconds. 192 generate interrupts and kick the watchdog task every 4 seconds.
193 An NMI is generated every 10 seconds or so to check for hardlockups. 193 An NMI is generated every 10 seconds or so to check for hardlockups.
194 If NMIs are not available on the platform, every 12 seconds the
195 hrtimer interrupt on one cpu will be used to check for hardlockups
196 on the next cpu.
194 197
195 The frequency of hrtimer and NMI events and the soft and hard lockup 198 The frequency of hrtimer and NMI events and the soft and hard lockup
196 thresholds can be controlled through the sysctl watchdog_thresh. 199 thresholds can be controlled through the sysctl watchdog_thresh.
197 200
198config HARDLOCKUP_DETECTOR 201config HARDLOCKUP_DETECTOR_NMI
199 def_bool y 202 def_bool y
200 depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG 203 depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
201 depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI 204 depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
202 205
206config HARDLOCKUP_DETECTOR_OTHER_CPU
207 def_bool y
208 depends on LOCKUP_DETECTOR && SMP
209 depends on !HARDLOCKUP_DETECTOR_NMI && !HAVE_NMI_WATCHDOG
210
211config HARDLOCKUP_DETECTOR
212 def_bool y
213 depends on HARDLOCKUP_DETECTOR_NMI || HARDLOCKUP_DETECTOR_OTHER_CPU
214
203config BOOTPARAM_HARDLOCKUP_PANIC 215config BOOTPARAM_HARDLOCKUP_PANIC
204 bool "Panic (Reboot) On Hard Lockups" 216 bool "Panic (Reboot) On Hard Lockups"
205 depends on HARDLOCKUP_DETECTOR 217 depends on HARDLOCKUP_DETECTOR