- 论坛徽章:
- 3
|
是这样的,最近碰到个棘手的问题。双核cpu(ARM),在linux3.0.y内核底下,内核启动时扫描多个sata硬盘死机。
现象有两种
现象1:死机,无任何输出,终端无输入输出反应
现象2:是循环打印INFO: rcu_sched_state detected stalls on CPUs/tasks: { 1} (detected by 0, t=6002 jiffies)
我之前对内核sd_probe_async函数做了一点修改,加了一点东西
在add_disk之后
...........................
kernel_write_info( (unsigned char *)&dvr_usbdisklist_info, sizeof(dvr_intr_info));
printk(KERN_NOTICE "add a disk:%s,unique_id:%d,this_id:%d,sdp id:%d,lun:%d,channel:%d\n",
stmp->diskname,stmp->uniqueid,sdp->host->this_id,sdp->id,sdp->lun,sdp->channel);
printk(KERN_NOTICE "add a disk:%s,unique_id:%d\n",stmp->diskname,stmp->uniqueid);
spin_lock(&scsi_head_lock);
appand_to_list(SN_DISK_MAP, &scsi_disk_head, stmp);//链表操作
printk(KERN_NOTICE "===>appand %s[%d] to list\n", stmp->diskname, stmp->portnum);
spin_unlock(&scsi_head_lock);
........................
其中kernel_write_info实现如下,主要是down_interruptible等信号量操作。由于还是在内核加载阶段,应用程序还没有打开这个设备,所以在发现dev->nreaders <= 0之后马上up并返回了
/*
 * kernel_write_info - queue a private copy of @buf for readers of the
 * interrupt-info pipe device (intr_p_devices).
 *
 * @buf:   data to copy into the queue
 * @count: number of bytes to copy from @buf
 *
 * Returns @count when the data was queued, 0 when no reader has the device
 * open (the data is dropped), -EINVAL when @count is 0, -ERESTARTSYS when
 * waiting for the device semaphore was interrupted, and -ENOMEM when an
 * allocation failed.
 *
 * This function may sleep (down_interruptible() and GFP_KERNEL allocations),
 * so it must not be called with a spinlock held or from interrupt context.
 */
int kernel_write_info(unsigned char *buf, size_t count)
{
	struct intr_pipe *dev = &intr_p_devices;
	buffer_quere *tmp;
	int ret;

	/* count is unsigned, so the only invalid value is 0. */
	if (count == 0)
		return -EINVAL;

	if (down_interruptible(&dev->sem))
		return -ERESTARTSYS;

	/* Nobody has the device open: drop the data instead of queueing it. */
	if (dev->nreaders <= 0) {
		printk("nobody care the interrupt info\n");
		ret = 0;
		goto out_unlock;
	}

	tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
	if (!tmp) {
		ret = -ENOMEM;
		goto out_unlock;
	}
	memset(tmp, 0, sizeof(*tmp));

	tmp->buffer = kmalloc(count, GFP_KERNEL);
	if (!tmp->buffer) {
		kfree(tmp);
		ret = -ENOMEM;
		goto out_unlock;
	}

	/* memcpy overwrites all count bytes, so no prior zeroing is needed. */
	memcpy(tmp->buffer, buf, count);
	tmp->len = count;
	appand_to_list(buffer_quere, &dev->bquere, tmp);
	up(&dev->sem);

	/* finally, awake any reader blocked in read() and select() */
	wake_up_interruptible(&dev->inq);

	/* and signal asynchronous readers */
	if (dev->async_queue)
		kill_fasync(&dev->async_queue, SIGIO, POLL_IN);

	printk("\"%s\" did write %li bytes\n", current->comm, (long)count);
	return count;

out_unlock:
	up(&dev->sem);
	return ret;
}
sd_probe_async主要是通过workqueue调用的:多个硬盘在sd_probe加载调用的时候依次挂到workqueue中,按道理说应该是sda、sdb、sdc依次初始化。但是发现工作队列在处理sda、sdb时只处理了一半,sdc又开始初始化了。按道理说,双核cpu在处理工作队列的时候,sda、sdb分别占用一个cpu,如果都挂起了,sdc的初始化函数sd_probe_async应该没有cpu可以调用,但是打印情况却是这样的:
ata1.01: SATA link down (SStatus 0 SControl 310)
ata1.02: hard resetting link
ata1.02: SATA link up 1.5 Gbps (SStatus 113 SControl 310)
ata1.03: hard resetting link
ata1.03: SATA link up 1.5 Gbps (SStatus 113 SControl 310)
ata1.04: hard resetting link
ata1.04: SATA link up 1.5 Gbps (SStatus 113 SControl 310)
ata1.00: ATA-8: ST3250318AS, CC46, max UDMA/133
ata1.00: 488397168 sectors, multi 0: LBA48 NCQ (depth 31/32)
ata1.00: configured for UDMA/133
ata1.02: ATA-8: ST1000DM003-9YN162, CC4B, max UDMA/133
ata1.02: 1953525168 sectors, multi 0: LBA48 NCQ (depth 31/32)
ata1.02: configured for UDMA/133
ata1.03: ATA-8: ST33000651AS, CC45, max UDMA/133
ata1.03: 5860533168 sectors, multi 0: LBA48 NCQ (depth 31/32)
ata1.03: configured for UDMA/133
ata1.04: ATA-8: WDC WD20EADS-00R6B0, 01.00A01, max UDMA/133
ata1.04: 3907029168 sectors, multi 0: LBA48 NCQ (depth 31/32)
ata1.04: configured for UDMA/133
ata1: EH complete
scsi 0:0:0:0: Direct-Access ATA ST3250318AS CC46 PQ: 0 ANSI: 5
sd 0:0:0:0: [sda] 488397168 512-byte logical blocks: (250 GB/232 GiB)
sd 0:0:0:0: Attached scsi generic sg0 type 0
scsi 0:2:0:0: Direct-Access ATA ST1000DM003-9YN1 CC4B PQ: 0 ANSI: 5
sd 0:0:0:0: [sda] Write Protect is off
sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
sd 0:2:0:0: [sdb] 1953525168 512-byte logical blocks: (1.00 TB/931 GiB)
sd 0:2:0:0: Attached scsi generic sg1 type 0
scsi 0:3:0:0: Direct-Access ATA ST33000651AS CC45 PQ: 0 ANSI: 5
sd 0:3:0:0: [sdc] 5860533168 512-byte logical blocks: (3.00 TB/2.72 TiB) sdc在sda sdb还没完成的时候开始被初始化了
sd 0:3:0:0: [sdc] Write Protect is off
sd 0:3:0:0: [sdc] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
sd 0:2:0:0: [sdb] 4096-byte physical blocks
sda: sda1 sda2 sda3 sda4
sd 0:3:0:0: Attached scsi generic sg2 type 0
sd 0:2:0:0: [sdb] Write Protect is off
sd 0:2:0:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
scsi 0:4:0:0: Direct-Access ATA WDC WD20EADS-00R 01.0 PQ: 0 ANSI: 5
nobody care the interrupt info
add a disk:sda,unique_id:1,this_id:-1,sdp id:0,lun:0,channel:0
sd 0:4:0:0: Attached scsi generic sg3 type 0
add a disk:sda,unique_id:1
sd 0:4:0:0: [sdd] 3907029168 512-byte logical blocks: (2.00 TB/1.81 TiB)
sd 0:4:0:0: [sdd] Write Protect is off
sd 0:4:0:0: [sdd] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
===>appand sda[256] to list 这里应该是锁住了。。。
求解workqueue为什么会这样,还有 各位大神如何调试死锁的问题呢
下面是rcu stall的log
scsi 1:1:0:0: Direct-Access ATA ST33000651AS CC45 PQ: 0 ANSI: 5
sd 1:1:0:0: [sdb] 5860533168 512-byte logical blocks: (3.00 TB/2.72 TiB)
sd 1:1:0:0: Attached scsi generic sg1 type 0
scsi 1:2:0:0: Direct-Access ATA WDC WD20EADS-00R 01.0 PQ: 0 ANSI: 5
sd 1:2:0:0: [sdc] 3907029168 512-byte logical blocks: (2.00 TB/1.81 TiB)
sd 1:2:0:0: [sdc] Write Protect is off
sd 1:2:0:0: [sdc] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
sd 1:2:0:0: Attached scsi generic sg2 type 0
scsi 1:3:0:0: Direct-Access ATA Hitachi HDS5C101 JC4O PQ: 0 ANSI: 5
sd 1:3:0:0: [sdd] 1953525168 512-byte logical blocks: (1.00 TB/931 GiB)
sd 1:3:0:0: [sdd] Write Protect is off
sd 1:3:0:0: [sdd] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
sd 1:3:0:0: Attached scsi generic sg3 type 0
sdc: sdc1 sdc2 sdc3 sdc4
sd 1:1:0:0: [sdb] Write Protect is off
sd 1:1:0:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
nobody care the interrupt info
add a disk:sdc,unique_id:2,this_id:-1,sdp id:0,lun:0,channel:2
add a disk:sdc,unique_id:2
===>appand sdc[514] to list
sdd: sdd1 sdd2 sdd3 sdd4
INFO: rcu_sched_state detected stalls on CPUs/tasks: { 1} (detected by 0, t=6002 jiffies)
Backtrace for cpu 0 (current):
Backtrace:
[<c0041efc>] (dump_backtrace+0x0/0x110) from [<c055735c>] (dump_stack+0x18/0x1c)
r6:00000000 r5:00000000 r4:c06d2e24 r3:c06b0874
[<c0557344>] (dump_stack+0x0/0x1c) from [<c00436d4>] (smp_send_all_cpu_backtrace+0x74/0x104)
[<c0043660>] (smp_send_all_cpu_backtrace+0x0/0x104) from [<c003f538>] (arch_trigger_all_cpu_backtrace+0x10/0x14)
r7:c06b2380 r6:c06ac080 r5:c06b2380 r4:c0b34440
[<c003f528>] (arch_trigger_all_cpu_backtrace+0x0/0x14) from [<c0096570>] (__rcu_pending+0x36c/0x398)
[<c0096204>] (__rcu_pending+0x0/0x398) from [<c0096614>] (rcu_check_callbacks+0x78/0x168)
[<c009659c>] (rcu_check_callbacks+0x0/0x168) from [<c006630c>] (update_process_times+0x40/0x54)
r5:00000000 r4:df966580
[<c00662cc>] (update_process_times+0x0/0x54) from [<c0083748>] (tick_periodic+0x5c/0xcc)
r6:7b29dc00 r5:00000000 r4:c06ac0c4 r3:00000000
[<c00836ec>] (tick_periodic+0x0/0xcc) from [<c00837e4>] (tick_handle_periodic+0x2c/0xa8)
r9:df806280 r8:00000023 r7:00000000 r6:c0b34040 r4:c0715758
r3:c0b34040
[<c00837b8>] (tick_handle_periodic+0x0/0xa8) from [<c0083b84>] (tick_do_periodic_broadcast+0xf4/0xfc)
[<c0083a90>] (tick_do_periodic_broadcast+0x0/0xfc) from [<c0083ba4>] (tick_handle_periodic_broadcast+0x18/0x78)
r6:c06b01c0 r5:df8062cc r4:c06b0220 r3:c0083b8c
[<c0083b8c>] (tick_handle_periodic_broadcast+0x0/0x78) from [<c004a208>] (godnet_timer_interrupt+0x38/0x40)
[<c004a1d0>] (godnet_timer_interrupt+0x0/0x40) from [<c00903c4>] (handle_irq_event_percpu+0x58/0x188)
[<c009036c>] (handle_irq_event_percpu+0x0/0x188) from [<c009053c>] (handle_irq_event+0x48/0x68)
[<c00904f4>] (handle_irq_event+0x0/0x68) from [<c0092e28>] (handle_fasteoi_irq+0xa4/0x118)
r6:00000023 r5:df8062cc r4:df806280 r3:c0035110
[<c0092d84>] (handle_fasteoi_irq+0x0/0x118) from [<c0090350>] (generic_handle_irq+0x30/0x38)
r5:c0035b0c r4:00000023
[<c0090320>] (generic_handle_irq+0x0/0x38) from [<c0037060>] (asm_do_IRQ+0x60/0xc0)
r4:c06ac480 r3:00000080
[<c0037000>] (asm_do_IRQ+0x0/0xc0) from [<c003d778>] (__irq_svc+0x38/0xa0)
Exception stack(0xdf8bbc90 to 0xdf8bbcd8)
bc80: 00000001 00000001 fe301000 00020005
bca0: df8bbd14 c0b3d320 c0b3d320 00b07000 00000001 c0b3d328 00000001 df8bbd04
bcc0: 00000002 df8bbcd8 c0043474 c00885b0 20000013 ffffffff
r7:00b07000 r6:00000023 r5:fe300100 r4:ffffffff
[<c0088530>] (generic_exec_single+0x0/0x98) from [<c0088888>] (smp_call_function_single+0x1b4/0x1c0)
[<c00886d4>] (smp_call_function_single+0x0/0x1c0) from [<c0088afc>] (smp_call_function_many+0x268/0x29c)
[<c0088894>] (smp_call_function_many+0x0/0x29c) from [<c0088b60>] (smp_call_function+0x30/0x38)
[<c0088b30>] (smp_call_function+0x0/0x38) from [<c0088b80>] (on_each_cpu+0x18/0x38)
[<c0088b68>] (on_each_cpu+0x0/0x38) from [<c00ffb5c>] (invalidate_bh_lrus+0x20/0x24)
r7:00000001 r6:00000000 r5:df44c210 r4:df44c200
[<c00ffb3c>] (invalidate_bh_lrus+0x0/0x24) from [<c01078e0>] (kill_bdev+0x28/0x40)
[<c01078b8>] (kill_bdev+0x0/0x40) from [<c0108754>] (__blkdev_put+0x68/0x180)
r4:df44c200 r3:00800000
[<c01086ec>] (__blkdev_put+0x0/0x180) from [<c0108898>] (blkdev_put+0x2c/0x158)
r8:00000000 r7:df44c200 r6:df8af268 r5:00000001 r4:df44c200
r3:00000000
[<c010886c>] (blkdev_put+0x0/0x158) from [<c02c9084>] (register_disk+0x170/0x174)
r8:00000000 r7:df44c200 r6:df8af268 r5:00000001 r4:df8af200
r3:00000000
[<c02c8f14>] (register_disk+0x0/0x174) from [<c02c9134>] (add_disk+0xac/0x278)
r7:00000000 r6:df8af268 r5:df8ec790 r4:df8af200
[<c02c9088>] (add_disk+0x0/0x278) from [<c035e2cc>] (sd_probe_async+0xfc/0x39c)
r8:00000000 r7:df8bbeb4 r6:df9a3000 r5:df8af200 r4:df86ca00
[<c035e1d0>] (sd_probe_async+0x0/0x39c) from [<c007cbc4>] (async_run_entry_fn+0x94/0x1cc)
[<c007cb30>] (async_run_entry_fn+0x0/0x1cc) from [<c006e940>] (process_one_work+0x120/0x41c)
r9:df872548 r8:c007cb30 r7:00000000 r6:df804c00 r5:c0715120
r4:df8da400
[<c006e820>] (process_one_work+0x0/0x41c) from [<c006f00c>] (worker_thread+0x1a8/0x4ac)
[<c006ee64>] (worker_thread+0x0/0x4ac) from [<c0075518>] (kthread+0x94/0x98)
[<c0075484>] (kthread+0x0/0x98) from [<c005c1d4>] (do_exit+0x0/0x768)
r7:00000013 r6:c005c1d4 r5:c0075484 r4:df8ffeac
sending IPI to all other CPUs:
INFO: rcu_sched_state detected stalls on CPUs/tasks: { 1} (detected by 0, t=24034 jiffies)
Backtrace for cpu 0 (current):
Backtrace:
[<c0041efc>] (dump_backtrace+0x0/0x110) from [<c055735c>] (dump_stack+0x18/0x1c)
r6:00000000 r5:00000000 r4:c06d2e24 r3:c06b0874
[<c0557344>] (dump_stack+0x0/0x1c) from [<c00436d4>] (smp_send_all_cpu_backtrace+0x74/0x104)
[<c0043660>] (smp_send_all_cpu_backtrace+0x0/0x104) from [<c003f538>] (arch_trigger_all_cpu_backtrace+0x10/0x14)
r7:c06b2380 r6:c06ac080 r5:c06b2380 r4:c0b34440
[<c003f528>] (arch_trigger_all_cpu_backtrace+0x0/0x14) from [<c0096570>] (__rcu_pending+0x36c/0x398)
[<c0096204>] (__rcu_pending+0x0/0x398) from [<c0096614>] (rcu_check_callbacks+0x78/0x168)
[<c009659c>] (rcu_check_callbacks+0x0/0x168) from [<c006630c>] (update_process_times+0x40/0x54)
r5:00000000 r4:df966580
[<c00662cc>] (update_process_times+0x0/0x54) from [<c0083748>] (tick_periodic+0x5c/0xcc)
r6:7712b400 r5:00000000 r4:c06ac0c4 r3:00000000
[<c00836ec>] (tick_periodic+0x0/0xcc) from [<c00837e4>] (tick_handle_periodic+0x2c/0xa8)
r9:df806280 r8:00000023 r7:00000000 r6:c0b34040 r4:c0715758
r3:c0b34040
[<c00837b8>] (tick_handle_periodic+0x0/0xa8) from [<c0083b84>] (tick_do_periodic_broadcast+0xf4/0xfc)
[<c0083a90>] (tick_do_periodic_broadcast+0x0/0xfc) from [<c0083ba4>] (tick_handle_periodic_broadcast+0x18/0x78)
r6:c06b01c0 r5:df8062cc r4:c06b0220 r3:c0083b8c
[<c0083b8c>] (tick_handle_periodic_broadcast+0x0/0x78) from [<c004a208>] (godnet_timer_interrupt+0x38/0x40)
[<c004a1d0>] (godnet_timer_interrupt+0x0/0x40) from [<c00903c4>] (handle_irq_event_percpu+0x58/0x188)
[<c009036c>] (handle_irq_event_percpu+0x0/0x188) from [<c009053c>] (handle_irq_event+0x48/0x68)
[<c00904f4>] (handle_irq_event+0x0/0x68) from [<c0092e28>] (handle_fasteoi_irq+0xa4/0x118)
r6:00000023 r5:df8062cc r4:df806280 r3:c0035110
[<c0092d84>] (handle_fasteoi_irq+0x0/0x118) from [<c0090350>] (generic_handle_irq+0x30/0x38)
r5:c0035b0c r4:00000023
[<c0090320>] (generic_handle_irq+0x0/0x38) from [<c0037060>] (asm_do_IRQ+0x60/0xc0)
r4:c06ac480 r3:00000080
[<c0037000>] (asm_do_IRQ+0x0/0xc0) from [<c003d778>] (__irq_svc+0x38/0xa0)
Exception stack(0xdf8bbc90 to 0xdf8bbcd8)
bc80: 00000001 00000001 fe301000 00020005
bca0: df8bbd14 c0b3d320 c0b3d320 00b07000 00000001 c0b3d328 00000001 df8bbd04
bcc0: 00000002 df8bbcd8 c0043474 c00885b0 20000013 ffffffff
r7:00b07000 r6:00000023 r5:fe300100 r4:ffffffff
[<c0088530>] (generic_exec_single+0x0/0x98) from [<c0088888>] (smp_call_function_single+0x1b4/0x1c0)
[<c00886d4>] (smp_call_function_single+0x0/0x1c0) from [<c0088afc>] (smp_call_function_many+0x268/0x29c)
[<c0088894>] (smp_call_function_many+0x0/0x29c) from [<c0088b60>] (smp_call_function+0x30/0x38)
[<c0088b30>] (smp_call_function+0x0/0x38) from [<c0088b80>] (on_each_cpu+0x18/0x38)
[<c0088b68>] (on_each_cpu+0x0/0x38) from [<c00ffb5c>] (invalidate_bh_lrus+0x20/0x24)
r7:00000001 r6:00000000 r5:df44c210 r4:df44c200
[<c00ffb3c>] (invalidate_bh_lrus+0x0/0x24) from [<c01078e0>] (kill_bdev+0x28/0x40)
[<c01078b8>] (kill_bdev+0x0/0x40) from [<c0108754>] (__blkdev_put+0x68/0x180)
r4:df44c200 r3:00800000
[<c01086ec>] (__blkdev_put+0x0/0x180) from [<c0108898>] (blkdev_put+0x2c/0x158)
r8:00000000 r7:df44c200 r6:df8af268 r5:00000001 r4:df44c200
r3:00000000
[<c010886c>] (blkdev_put+0x0/0x158) from [<c02c9084>] (register_disk+0x170/0x174)
r8:00000000 r7:df44c200 r6:df8af268 r5:00000001 r4:df8af200
r3:00000000
[<c02c8f14>] (register_disk+0x0/0x174) from [<c02c9134>] (add_disk+0xac/0x278)
r7:00000000 r6:df8af268 r5:df8ec790 r4:df8af200
[<c02c9088>] (add_disk+0x0/0x278) from [<c035e2cc>] (sd_probe_async+0xfc/0x39c)
r8:00000000 r7:df8bbeb4 r6:df9a3000 r5:df8af200 r4:df86ca00
[<c035e1d0>] (sd_probe_async+0x0/0x39c) from [<c007cbc4>] (async_run_entry_fn+0x94/0x1cc)
[<c007cb30>] (async_run_entry_fn+0x0/0x1cc) from [<c006e940>] (process_one_work+0x120/0x41c)
r9:df872548 r8:c007cb30 r7:00000000 r6:df804c00 r5:c0715120
r4:df8da400
[<c006e820>] (process_one_work+0x0/0x41c) from [<c006f00c>] (worker_thread+0x1a8/0x4ac)
[<c006ee64>] (worker_thread+0x0/0x4ac) from [<c0075518>] (kthread+0x94/0x98)
[<c0075484>] (kthread+0x0/0x98) from [<c005c1d4>] (do_exit+0x0/0x768)
r7:00000013 r6:c005c1d4 r5:c0075484 r4:df8ffeac
sending IPI to all other CPUs
|
|