- 论坛徽章:
- 0
|
回复 1# nevastill
1. 在hung_task模块初始化时会启动一个内核线程khungtaskd执行watchdog函数- static int __init hung_task_init(void)
- {
/*
 * Init routine for the hung-task detector: registers panic_block on
 * panic_notifier_list, then starts a kernel thread named "khungtaskd"
 * whose entry point is watchdog(). Always returns 0.
 */
- atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
/* NOTE(review): the kthread_run() result is stored but not checked here. */
- watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
- return 0;
- }
/* Register hung_task_init as the init function. */
- module_init(hung_task_init);
复制代码 2. 在线程khungtaskd中执行check_hung_uninterruptible_tasks- static int watchdog(void *dummy)
- {
/*
 * Thread body: loops forever, sleeping for the configured timeout and then
 * calling check_hung_uninterruptible_tasks(). The dummy argument is unused.
 */
- set_user_nice(current, 0);
- for ( ; ; ) {
- unsigned long timeout = sysctl_hung_task_timeout_secs;
/*
 * While schedule_timeout_interruptible() returns nonzero, re-read the
 * sysctl value and sleep again (presumably a nonzero return means the
 * sleep ended early -- TODO confirm against the scheduler API docs).
 */
- while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
- timeout = sysctl_hung_task_timeout_secs;
- check_hung_uninterruptible_tasks(timeout);
- }
/* Not reached: the loop above has no exit. */
- return 0;
- }
复制代码 3. check_hung_uninterruptible_tasks->check_hung_task->touch_nmi_watchdog
/*
 * Excerpt of check_hung_task(): the "... ..." line appears to mark code
 * omitted by the post's author. The visible tail calls touch_nmi_watchdog()
 * and then panics if sysctl_hung_task_panic is nonzero.
 */
- static void check_hung_task(struct task_struct *t, unsigned long timeout)
- {
- ... ...
- touch_nmi_watchdog();
/* Panic only when the sysctl_hung_task_panic knob is set. */
- if (sysctl_hung_task_panic)
- panic("hung_task: blocked tasks");
- }
复制代码 4. 在touch_nmi_watchdog设置pcpu变量nmi_touch- void touch_nmi_watchdog(void)
- {
/*
 * When nmi_watchdog_active() is true, set the per-CPU nmi_touch flag to 1
 * on every present CPU. Afterwards, unconditionally call
 * touch_softlockup_watchdog().
 */
- if (nmi_watchdog_active()) {
- unsigned cpu;
- /*
- * Tell other CPUs to reset their alert counters. We cannot
- * do it ourselves because the alert count increase is not
- * atomic.
- */
- for_each_present_cpu(cpu) {
/* Write the flag only when it is not already 1. */
- if (per_cpu(nmi_touch, cpu) != 1)
- per_cpu(nmi_touch, cpu) = 1;
- }
- }
- /*
- * Tickle the softlockup detector too:
- */
- touch_softlockup_watchdog();
- }
复制代码 5. 在nmi中断处理do_nmi中检查pcpu变量nmi_touch:如果nmi_touch未被置位(touched为0)且中断计数sum与last_irq_sum相同,则alert_counter加1;当alert_counter达到5 * nmi_hz时,调用die_nmi
代码流程:entry_32.S->call do_nmi->default_do_nmi->nmi_watchdog_tick
/*
 * Excerpt of nmi_watchdog_tick(): the "... ..." lines appear to mark code
 * omitted by the post's author (rc, touched and sum are presumably declared
 * and computed there -- TODO confirm against the full source).
 *
 * Visible behavior: consume the per-CPU nmi_touch flag, increment or reset
 * the per-CPU alert_counter, call die_nmi() when the counter reaches
 * 5 * nmi_hz, then handle the event according to the nmi_watchdog type.
 * Returns rc.
 */
- notrace __kprobes int
- nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
- {
- ... ...
- if (__get_cpu_var(nmi_touch)) { /* check the per-CPU nmi_touch flag; if it is nonzero, clear it and set the local touched flag */
- __get_cpu_var(nmi_touch) = 0;
- touched = 1;
- }
- ... ...
- /* Now test the touched flag: if it is 0 and last_irq_sum still equals sum, increment alert_counter */
- /* if the none of the timers isn't firing, this cpu isn't doing much */
- if (!touched && __get_cpu_var(last_irq_sum) == sum) {
- /*
- * Ayiee, looks like this CPU is stuck ...
- * wait a few IRQs (5 seconds) before doing the oops ...
- */
- __this_cpu_inc(alert_counter);
- if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
- /*
- * die_nmi will return ONLY if NOTIFY_STOP happens..
- */
- die_nmi("BUG: NMI Watchdog detected LOCKUP",
- regs, panic_on_timeout);
- } else {
- /* otherwise record sum in last_irq_sum and reset alert_counter to 0 */
- __get_cpu_var(last_irq_sum) = sum;
- __this_cpu_write(alert_counter, 0);
- }
- /* see if the nmi watchdog went off */
- if (!__get_cpu_var(wd_enabled))
- return rc;
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- /* for the local-APIC watchdog type, call the following function to feed (re-arm) the watchdog */
- rc |= lapic_wd_event(nmi_hz);
- break;
- case NMI_IO_APIC:
- /*
- * don't know how to accurately check for this.
- * just assume it was a watchdog timer interrupt
- * This matches the old behaviour.
- */
- rc = 1;
- break;
- }
- return rc;
- }
复制代码 6. APIC类型的watch_dog喂狗函数- int __kprobes lapic_wd_event(unsigned nmi_hz)
- {
/*
 * Read the MSR named by this CPU's nmi_watchdog_ctlblk.perfctr_msr. If the
 * bit(s) in wd_ops->checkbit are set in the value, return 0 without
 * re-arming; otherwise call wd_ops->rearm(wd, nmi_hz) and return 1.
 */
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- u64 ctr;
- rdmsrl(wd->perfctr_msr, ctr);
- if (ctr & wd_ops->checkbit) /* perfctr still running? */
- return 0;
- wd_ops->rearm(wd, nmi_hz);
- return 1;
- }
/*
 * Watchdog operations table for the Intel architectural perfmon case:
 * single-MSR reserve/unreserve/stop helpers, setup_intel_arch_watchdog for
 * setup, and p6_rearm as the rearm callback. The counter and event-select
 * MSRs are MSR_ARCH_PERFMON_PERFCTR1 / MSR_ARCH_PERFMON_EVENTSEL1.
 */
- static struct wd_ops intel_arch_wd_ops __read_mostly = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_intel_arch_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
- .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
- };
/*
 * Rearm callback: writes APIC_DM_NMI to the APIC_LVTPC register, then
 * reloads the counter MSR wd->perfctr_msr through
 * write_watchdog_counter32(), passing nmi_hz.
 */
- static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
- {
- /*
- * P6 based Pentium M need to re-unmask
- * the apic vector but it doesn't hurt
- * other P6 variant.
- * ArchPerfom/Core Duo also needs this
- */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- /* P6/ARCH_PERFMON has 32 bit counter write */
- write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
- }
复制代码 |
|