Kernel Bug-Vulnerability-Comment library
(ChinaUnix forum thread, started by sisi8408)

#111 | 2008-05-11 00:29

/*
 * 2.6.24.4-rt4
 */
+static int ksoftirqd(void * __data)
+{
+        struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 };
+        struct softirqdata *data = __data;
+        u32 softirq_mask = (1 << data->nr);
+        struct softirq_action *h;
+        int cpu = data->cpu;
+
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+        init_waitqueue_head(&data->wait);
+#endif
+
+        sys_sched_setscheduler(current->pid, SCHED_FIFO, &param);
+        current->flags |= PF_SOFTIRQ;
         set_current_state(TASK_INTERRUPTIBLE);

/*
 * 2.6.24.4
 */
static void task_tick_rt(struct rq *rq, struct task_struct *p)
{
        update_curr_rt(rq);

        /*
         * RR tasks need a special form of timeslice management.
         * FIFO tasks have no timeslices.
         */
        if (p->policy != SCHED_RR)
                return;

        if (--p->time_slice)
                return;

        p->time_slice = DEF_TIMESLICE;

        /*
         * Requeue to the end of queue if we are not the only element
         * on the queue:
         */
        if (p->run_list.prev != p->run_list.next) {
                requeue_task_rt(rq, p);
                set_tsk_need_resched(p);
        }
}

If ksoftirqd runs as an RT task under SCHED_FIFO, then, given that FIFO tasks have no
timeslices (see task_tick_rt() above), it is possible in many cases, even in the -rt tree,
for NET_TX_SOFTIRQ, NET_RX_SOFTIRQ and the other per-softirq threads to block and starve
one another. That may result in abnormal behaviour, say slab cache corruption.
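
One way to reduce the mutual starvation, offered purely as a hypothetical sketch rather
than a tested patch, would be to give the per-softirq threads SCHED_RR instead of
SCHED_FIFO, so that the round-robin path in task_tick_rt() above rotates equal-priority
softirq threads:

/* Hypothetical variant of the -rt ksoftirqd() setup shown above:
 * SCHED_RR keeps the same RT priority but gets a timeslice, so task_tick_rt()
 * requeues the thread and sets need_resched when the slice expires,
 * letting same-priority softirq threads (net-tx, net-rx, ...) take turns.
 */
struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 };

sys_sched_setscheduler(current->pid, SCHED_RR, &param);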


#112 | 2008-05-17 10:05

/* 2.6.24.4
 * static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 */
-                if (interval > HZ * NR_CPUS /10)
-                        interval = HZ * NR_CPUS /10;
+                if (interval > (HZ / 10) * num_online_cpus())
+                        interval = (HZ / 10) * num_online_cpus();
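The point of the change: NR_CPUS is the compile-time maximum, while num_online_cpus()
reflects the CPUs actually present, so the old clamp can be far too large on a distro
kernel built with a big NR_CPUS. A rough worked example, with hypothetical numbers:

/* Hypothetical numbers: HZ = 1000, kernel built with NR_CPUS = 255,
 * but only 4 CPUs actually online at runtime.
 */
old_cap = 1000 * 255 / 10;      /* = 25500 jiffies, ~25.5 s between rebalances */
new_cap = (1000 / 10) * 4;      /* =   400 jiffies,  ~0.4 s                    */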

#113 | 2008-05-18 00:34

/* 2.6.24.4
 * static int __assign_irq_vector(int irq, cpumask_t mask)
 */
                if (unlikely(current_vector == vector))
-                        continue;
+                        goto next;

#114 | 2008-06-08 00:06

/*
 * from a post by 独孤九贱
 */
Unable to handle kernel paging request at virtual address 713401b6

printing eip: *pde = 00000000
Oops: 0000 [#1]
SMP
Modules linked in: uflux e1000 trusthost
CPU:    0
EIP:    0060:[<c03c1de6>]    Not tainted VLI
EFLAGS: 00010206   (2.6.12)

EIP is at ip_route_input+0x86/0x1d0

eax: df976fc0   ebx: c0629000   ecx: 7134010a   edx: 00000000
esi: 0121010a   edi: 2495313a   ebp: 00000003   esp: c0629f04
ds: 007b   es: 007b   ss: 0068

Process swapper (pid: 0, threadinfo=c0629000 task=c0508c20)

Stack: 0121010a 2495315a 00000000 00000000 df987800 00000000 df987800 dd0b6400
       00000000 006d8580 00000000 dd0b6400 dd14c620 c03c4fc0 c03c4eff dd0b6400
       0121010a 2495313a 00000000 df987800 c03c4fc0 80000000 dd18d440 dc7ef240
Call Trace:
[<c03c4fc0>] ip_rcv_finish+0x0/0x310
[<c03c4eff>] ip_rcv+0x4ef/0x5b0
[<c03c4fc0>] ip_rcv_finish+0x0/0x310
[<c039a771>] netif_receive_skb+0x1e1/0x280
[<c039a8a3>] process_backlog+0x93/0x130
[<c039a9ef>] net_rx_action+0xaf/0x1a0
[<c0129942>] __do_softirq+0x72/0xe0
[<c0106bcb>] do_softirq+0x5b/0x60

/* ------------------------------------------------ */

    39c0:       a1 00 00 00 00          mov    0x0,%eax
    39c5:       8b 53 10                mov    0x10(%ebx),%edx
    39c8:       f7 d0                   not    %eax
    39ca:       8b 04 90                mov    (%eax,%edx,4),%eax
    39cd:       ff 40 38                incl   0x38(%eax)
    39d0:       8b 09                   mov    (%ecx),%ecx
    39d2:       85 c9                   test   %ecx,%ecx
    39d4:       74 7a                   je     3a50 <ip_route_input+0x100>
    39d6:       39 b1 ac 00 00 00       cmp    %esi,0xac(%ecx)


1,      EIP is at 39d6, as shown by the original poster,
        so it is clear that the rt entry is still in the hash table,
        otherwise the reader of the rt hash table would have no chance to see it.
        And there are at most 3 windows, from 39d0 to 39d6, in which a hard irq
        can interrupt the reader.

2,      From the call trace, the reader is running in softirq context,
        and NAPI is not active in the kernel used.

3,      Since it is non-NAPI, unfortunately, the NIC ISR is longer and only
        completes when netif_rx() returns.

4,      It is assumed that the hardware works fine, anyway.
        An updater on another cpu (if there were no other cpu, the oops would have
        no chance to play its game, precisely because it occurs in softirq context)
        deletes the rt entry loaded into %ecx at 39d0,
        just after the reader happens to be interrupted by a hard irq,
        say because a new skb is coming in.

5,      With high probability, the updater runs from a timer hard irq on another cpu:

/* 2.6.24.4
 *
 * Called from the timer interrupt handler,
 * to charge one tick to the current process.
 *
 * user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
        struct task_struct *p = current;
        int cpu = smp_processor_id();

        account_process_tick(p, user_tick);
        run_local_timers();

        if (rcu_pending(cpu))
                rcu_check_callbacks(cpu, user_tick);

        scheduler_tick();
        run_posix_cpu_timers(p);
}

        The rt gc timer, executed via run_local_timers(),
        happens to delete the rt entry loaded into %ecx at 39d0,
        which the reader has already seen and which is not NULL,
        and hands it to the RCU core by calling call_rcu_bh().

        In rcu_pending(), there are 4 conditions that can fire
        rcu_check_callbacks(), in which
        tasklet_schedule(&per_cpu(rcu_tasklet, cpu)) is called in any case
        (see the sketch of rcu_check_callbacks() after this list).

6,      Though the rcu_tasklet is registered,

/*
 * This does the RCU processing work from tasklet context.
 */
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
                                        struct rcu_data *rdp)
{
        if (rdp->curlist &&
            !rcu_batch_before(rcp->completed, rdp->batch)) {
                *rdp->donetail = rdp->curlist;
                rdp->donetail = rdp->curtail;
                /*
                 * rcp->completed >= rdp->batch
                 */
                rdp->curlist = NULL;
                rdp->curtail = &rdp->curlist;
        }

        if (rdp->nxtlist && !rdp->curlist) {
                local_irq_disable();
                rdp->curlist = rdp->nxtlist;
                rdp->curtail = rdp->nxttail;

                rdp->nxtlist = NULL;
                rdp->nxttail = &rdp->nxtlist;
                local_irq_enable();

                /*
                 * start the next batch of callbacks
                 */

                /* determine batch number */
                rdp->batch = rcp->cur + 1;

                /* see the comment and corresponding wmb() in
                 * the rcu_start_batch()
                 */
                smp_rmb();

                if (!rcp->next_pending) {
                        /* and start it/schedule start if it's a new batch */
                        spin_lock(&rcp->lock);
                        rcp->next_pending = 1;
                        rcu_start_batch(rcp);
                        spin_unlock(&rcp->lock);
                }
        }

        rcu_check_quiescent_state(rcp, rdp);

        if (rdp->donelist)
                rcu_do_batch(rdp);
}

        call_rcu_bh() only adds the victim to rdp->nxtlist,
        which will not be freed in the first run of the rcu_tasklet after the
        timer hard irq, but may be freed in the following runs of the rcu_tasklet
        after the same timer hard irq, because softirqs are allowed more than once,
        if the following conditions are met:

        I,      if (rdp->nxtlist && !rdp->curlist) is true
                on the first run of the rcu_tasklet, so that
                rdp->curlist = rdp->nxtlist;

        II,     if (rdp->nxtlist && !rdp->curlist) is true,
                also on the first run of the rcu_tasklet,

        III,    rcu_bh_qsctr_inc(cpu) has already been called in rcu_check_callbacks(),

        IV,     cpu_quiet() is called by rcu_check_quiescent_state()
                on the first run of the rcu_tasklet,

        V,      if (rdp->curlist &&
                        !rcu_batch_before(rcp->completed, rdp->batch))
                is true on the second run of the rcu_tasklet,

        VI,     and the reader is blocked for long enough.
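For reference, the classic-RCU rcu_check_callbacks() that point 5 above leans on looked
roughly like this in the 2.6.24 era (quoted from memory, so treat it as a sketch rather
than the authoritative source): the bh quiescent counter is bumped whenever the tick does
not land in softirq context, and the rcu_tasklet is scheduled unconditionally.

void rcu_check_callbacks(int cpu, int user)
{
        if (user ||
            (idle_cpu(cpu) && !in_softirq() &&
                                hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
                /* tick landed in user mode or on the idle task:
                 * a quiescent state for both flavours */
                rcu_qsctr_inc(cpu);
                rcu_bh_qsctr_inc(cpu);
        } else if (!in_softirq())
                /* not inside a softirq: a bh quiescent state */
                rcu_bh_qsctr_inc(cpu);
        tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
}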


#115 | 2008-07-18 21:48

...
        schedstat_inc(rq, ttwu_count);

        if (cpu == this_cpu) {
                schedstat_inc(rq, ttwu_local);
                goto out_set_cpu;
        }
/* try_to_wake_up @ 2.6.24.4
        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                goto out_set_cpu;
*/
        for_each_domain(this_cpu, sd) {
                if (cpu_isset(cpu, sd->span)) {
                        schedstat_inc(sd, ttwu_wake_remote);
                        this_sd = sd;
                        break;
                }
        }

        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                goto out_set_cpu;

        /*
         * Check for affine wakeup and passive balancing possibilities.
         */
...

#116 | 2008-07-18 22:38

...
        /* build_sched_domains @ 2.6.24.4
         * Calculate CPU power for physical packages and nodes
         */
#ifdef CONFIG_SCHED_SMT
        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(cpu_domains, i);

                init_sched_groups_power(i, sd);
        }

#elif defined(CONFIG_SCHED_MC)
        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(core_domains, i);

                init_sched_groups_power(i, sd);
        }
#else

        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(phys_domains, i);

                init_sched_groups_power(i, sd);
        }
#endif

        /* Attach the domains */
...

#117 | 2008-07-27 11:39
queue & sched in .26 blk layer


1, blk init core
   =============

int __init blk_dev_init(void)
{
        int i;

        /* 1,
         * operations on the queue are based on `work', i.e. asynchronous
         * and in kthread context; go see workqueues,
         * much like do_softirq and ksoftirqd/x
         */
        kblockd_workqueue = create_workqueue("kblockd");
        if (!kblockd_workqueue)
                panic("Failed to create kblockd\n");

        request_cachep = kmem_cache_create("blkdev_requests",
                        sizeof(struct request), 0, SLAB_PANIC, NULL);
        /* 2,
         * as at other spots, a kmem cache is used, like the one for skb in net;
         * if you want to do something fancy with skbs, say zero-copy,
         * you have to maintain the alloc/free_skb methods yourself,
         * which is hard and hot, isn't it?
         */
        blk_requestq_cachep = kmem_cache_create("blkdev_queue",
                        sizeof(struct request_queue), 0, SLAB_PANIC, NULL);

        for_each_possible_cpu(i)
                INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
        /* 3,
         * per-cpu list prepared for BLOCK_SOFTIRQ,
         * powerful, and lockless of course
         */
        open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);

        register_hotcpu_notifier(&blk_cpu_notifier);

        return 0;
}
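To make note 1 above concrete: kblockd is consumed through kblockd_schedule_work(),
which in this era is essentially queue_work() on the kblockd workqueue, and its typical
user is the unplug timer, whose handler defers the actual q->unplug_fn() call into
kblockd (kthread) context. Roughly, as a sketch of the 2.6.26-era path with tracing
details trimmed:

int kblockd_schedule_work(struct work_struct *work)
{
        return queue_work(kblockd_workqueue, work);
}

/* timer (hard irq) context: just punt the unplug to kblockd */
static void blk_unplug_timeout(unsigned long data)
{
        struct request_queue *q = (struct request_queue *)data;

        kblockd_schedule_work(&q->unplug_work);
}

/* kblockd (kthread) context: do the real unplug */
static void blk_unplug_work(struct work_struct *work)
{
        struct request_queue *q =
                container_of(work, struct request_queue, unplug_work);

        q->unplug_fn(q);
}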

#118 | 2008-07-27 14:23

2, Q init
   ======

The blk layer connects to filesystems like reiserfs/btrfs with one hand,
and with the other hand connects to the scsi driver/controller.

In general, the blk layer represents the software methods on top of a block device,
including the queue and the scheduler, or simply the elevator, which is responsible
for managing whatever schedulers are available.

In the eyes of the elevator, a blk device is a queue,
allocated by the driver,

struct request_queue * blk_alloc_queue(gfp_t gfp_mask)
{
        return blk_alloc_queue_node(gfp_mask, -1);
}

struct request_queue * blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
        struct request_queue *q;
        int err;

        q = kmem_cache_alloc_node(blk_requestq_cachep,
                                gfp_mask | __GFP_ZERO, node_id);
        if (!q)
                return NULL;

        q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
        /* 1,
         * bdi, a concept from the disk cache;
         * go see and compare with ramdisk, a nice example of a pseudo device,
         * if you want to play games on an SSD.
         */
        q->backing_dev_info.unplug_io_data = q;

        err = bdi_init(&q->backing_dev_info);
        if (err) {
                kmem_cache_free(blk_requestq_cachep, q);
                return NULL;
        }

        /* 2,
         * like hrtimer, a simple machine for scheduling the controller
         */
        init_timer(&q->unplug_timer);

        kobject_init(&q->kobj, &blk_queue_ktype);
        /* 3,
         * regular sysfs socket; its parent looks cool
         */
        mutex_init(&q->sysfs_lock);

        spin_lock_init(&q->__queue_lock);
        return q;
}


and initialised in the blk layer,

struct request_queue * blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
        return blk_init_queue_node(rfn, lock, -1);
}

struct request_queue *
        blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
        struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);

        if (!q)
                return NULL;
        q->node = node_id;

        if (blk_init_free_list(q)) {
                kmem_cache_free(blk_requestq_cachep, q);
                return NULL;
        }

        /*
         * if caller didn't supply a lock,
         * they get per-queue locking with our embedded lock
         */
        if (!lock)
                lock = &q->__queue_lock;
        q->queue_lock        = lock;

        q->request_fn                = rfn;
        q->prep_rq_fn                = NULL;
        q->unplug_fn                = generic_unplug_device;
        q->queue_flags                = (1 << QUEUE_FLAG_CLUSTER);

        blk_queue_segment_boundary(q, 0xffffffff);

        blk_queue_make_request(q, __make_request);
        blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);

        blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
        blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);

        q->sg_reserved_size = INT_MAX;

        /*
         * all done
         *
         * inform the big brother, the elevator: ready for the game
         */
        if (!elevator_init(q, NULL)) {
                blk_queue_congestion_threshold(q);
                return q;
        }

        blk_put_queue(q);
        return NULL;
}
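A driver picks one of the two paths above depending on whether it wants the elevator at
all. A minimal, hypothetical request-based driver init might look like the sketch below;
my_request_fn and my_lock are invented names, and real data transfer and gendisk setup
are omitted:

static DEFINE_SPINLOCK(my_lock);

/* Called by the block layer (with q->queue_lock held) to drain the
 * dispatch queue; a real driver would program the hardware here.
 */
static void my_request_fn(struct request_queue *q)
{
        struct request *rq;

        while ((rq = elv_next_request(q)) != NULL) {
                /* a real driver would transfer rq's data here; end_request()
                 * completes the chunk the request currently points at */
                end_request(rq, 1);
        }
}

static int __init my_init(void)
{
        struct request_queue *q;

        q = blk_init_queue(my_request_fn, &my_lock);
        if (!q)
                return -ENOMEM;
        /* ... allocate a gendisk, attach q to it, add_disk(), etc. */
        return 0;
}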

#119 | 2008-07-27 15:44

3, basic methods for request alloc/free/prepare/init
   =================================================

The FS dispatches bios to the elevator; purely for efficiency they are merged into a
request if possible, and then delivered to the controller when necessary.

static struct request *
        get_request(struct request_queue *q, int rw_flags,
                        struct bio *bio, gfp_t gfp_mask)
{
        struct request *rq = NULL;
        struct request_list *rl = &q->rq;
        struct io_context *ioc = NULL;
        const int rw = rw_flags & 0x01;
        int may_queue, priv;

        may_queue = elv_may_queue(q, rw_flags);
        /* 1,
         * check that resources are not over the limit, much like cpu time.
         * the starvation concept still plays its role here;
         * how to compensate?
         */
        if (may_queue == ELV_MQUEUE_NO)
                goto rq_starved;

        if (rl->count[rw] + 1 >= queue_congestion_on_threshold(q)) {
                /* 2,
                 * not only for efficiency: nice scheduling also imposes
                 * resource limits upon all tasks in the system.
                 * fair play is considered here, with little regard to
                 * whether you are root, but compensation is also fair.
                 */
                if (rl->count[rw] + 1 >= q->nr_requests) {
                        ioc = current_io_context(GFP_ATOMIC, q->node);
                        /*
                         * The queue will fill after this allocation, so set
                         * it as full, and mark this process as "batching".
                         *
                         * This process will be allowed to complete a batch of
                         * requests, others will be blocked.
                         */
                        if (!blk_queue_full(q, rw)) {
                                ioc_set_batching(q, ioc);
                                blk_set_queue_full(q, rw);
                        } else {
                                if (may_queue != ELV_MQUEUE_MUST
                                        && !ioc_batching(q, ioc)) {
                                        /*
                                         * The queue is full and the allocating
                                         * process is not a "batcher", and not
                                         * exempted by the IO scheduler
                                         */
                                        goto out;
                                }
                                /* else
                                 *
                                 * batcher is biased to allocate up to 50%
                                 * over the defined limit
                                 */
                        }
                }
                blk_set_queue_congested(q, rw);
        }

        /*
         * Only allow batching queuers to allocate up to 50% over the defined
         * limit of requests, otherwise we could have thousands of requests
         * allocated with any setting of ->nr_requests
         */
        if (rl->count[rw] >= (3 * q->nr_requests / 2))
                goto out;

        rl->count[rw]++;
        rl->starved[rw] = 0;

        priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
        if (priv)
                rl->elvpriv++;

        spin_unlock_irq(q->queue_lock);

        rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
        if (unlikely(!rq)) {
                /*
                 * Allocation failed presumably due to memory. Undo anything
                 * we might have messed up.
                 *
                 * Allocating task should really be put onto the front of the
                 * wait queue, but this is pretty rare.
                 */
                spin_lock_irq(q->queue_lock);
                /* 3,
                 * nice scheduling is based upon housekeeping, right?
                 * just as mm is based on the page frame?
                 */
                freed_request(q, rw, priv);

                /*
                 * in the very unlikely event that allocation failed and no
                 * requests for this direction was pending, mark us starved
                 * so that freeing of a request in the other direction will
                 * notice us. another possible fix would be to split the
                 * rq mempool into READ and WRITE
                 */
rq_starved:
                if (unlikely(rl->count[rw] == 0))
                        rl->starved[rw] = 1;

                goto out;
        }

        /*
         * ioc may be NULL here, and ioc_batching will be false. That's
         * OK, if the queue is under the request limit then requests need
         * not count toward the nr_batch_requests limit. There will always
         * be some limit enforced by BLK_BATCH_TIME.
         */
        if (ioc_batching(q, ioc))
                ioc->nr_batch_requests--;

        blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
out:
        return rq;
}

static void freed_request(struct request_queue *q, int rw, int priv)
{
        struct request_list *rl = &q->rq;

        rl->count[rw]--;

        if (priv)
                rl->elvpriv--;

        __freed_request(q, rw);

        if (unlikely(rl->starved[rw ^ 1]))
                /* 1,
                 * today the elevator cares little about whether you are a reader,
                 * since nobody has enough power to declare that blue blood is nicer,
                 */
                __freed_request(q, rw ^ 1);
}
static void __freed_request(struct request_queue *q, int rw)
{
        struct request_list *rl = &q->rq;

        if (rl->count[rw] < queue_congestion_off_threshold(q))
                blk_clear_queue_congested(q, rw);

        if (rl->count[rw] + 1 <= q->nr_requests) {
                /* 2,
                 * but if and only if things are under control,
                 * a chance for compensation is still given, to keep the disk rotating
                 */
                if (waitqueue_active(&rl->wait[rw]))
                        wake_up(&rl->wait[rw]);

                blk_clear_queue_full(q, rw);
        }
}


void init_request_from_bio(struct request *req, struct bio *bio)
{
        req->cmd_type = REQ_TYPE_FS;

        /*
         * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
         */
        if (bio_rw_ahead(bio) || bio_failfast(bio))
                req->cmd_flags |= REQ_FAILFAST;

        /*
         * REQ_BARRIER implies no merging, but lets make it explicit
         */
        if (unlikely(bio_barrier(bio)))
                req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);

        if (bio_sync(bio))
                req->cmd_flags |= REQ_RW_SYNC;
        if (bio_rw_meta(bio))
                req->cmd_flags |= REQ_RW_META;

        req->errors = 0;
        req->hard_sector = req->sector = bio->bi_sector;
        /* 1,
         * prio is also considered, but how is it defined?
         */
        req->ioprio = bio_prio(bio);
        req->start_time = jiffies;

        blk_rq_bio_prep(req->q, req, bio);
}
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
                     struct bio *bio)
{
        /* first two bits are identical in rq->cmd_flags and bio->bi_rw */
        rq->cmd_flags |= (bio->bi_rw & 3);

        rq->nr_phys_segments = bio_phys_segments(q, bio);
        rq->nr_hw_segments = bio_hw_segments(q, bio);
        rq->current_nr_sectors = bio_cur_sectors(bio);
        rq->hard_cur_sectors = rq->current_nr_sectors;
        rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
        rq->buffer = bio_data(bio);
        rq->data_len = bio->bi_size;

        rq->bio = rq->biotail = bio;

        if (bio->bi_bdev)
                rq->rq_disk = bio->bi_bdev->bd_disk;
}
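For a sense of scale of the thresholds used in get_request() above: with the default
nr_requests of 128 (BLKDEV_MAX_RQ), the numbers work out roughly as follows. This is a
sketch; check blk_queue_congestion_threshold() in the block layer for the authoritative
formula.

/* assuming nr_requests = BLKDEV_MAX_RQ = 128 */
nr_congestion_on  = 128 - 128/8 + 1;            /* 113: mark the queue congested      */
nr_congestion_off = 128 - 128/8 - 128/16 - 1;   /* 103: only clear it well below that */
hard_cap          = 3 * 128 / 2;                /* 192: absolute per-direction cap    */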

#120 | 2008-07-27 16:39

4, basic method exported to FS
   ===========================

void submit_bio(int rw, struct bio *bio)
{
        int count = bio_sectors(bio);

        bio->bi_rw |= rw;

        /*
         * If it's a regular read/write or a barrier with data attached,
         * go through the normal accounting stuff before submission.
         */
        if (!bio_empty_barrier(bio)) {
                BIO_BUG_ON(!bio->bi_size);
                BIO_BUG_ON(!bio->bi_io_vec);

                if (rw & WRITE) {
                        count_vm_events(PGPGOUT, count);
                } else {
                        task_io_account_read(bio->bi_size);
                        count_vm_events(PGPGIN, count);
                }

                if (unlikely(block_dump)) {
                        char b[BDEVNAME_SIZE];

                        printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
                        current->comm, task_pid_nr(current),
                                (rw & WRITE) ? "WRITE" : "READ",
                                (unsigned long long)bio->bi_sector,
                                bdevname(bio->bi_bdev, b));
                }
        }
        generic_make_request(bio);
}

void generic_make_request(struct bio *bio)
{
        if (current->bio_tail) { /* make_request is active */
                *(current->bio_tail) = bio;
                bio->bi_next = NULL;
                current->bio_tail = &bio->bi_next;
                return;
        }
        /* following loop may be a bit non-obvious, and so deserves some
         * explanation.
         * Before entering the loop, bio->bi_next is NULL (as all callers
         * ensure that) so we have a list with a single bio.
         *
         * We pretend that we have just taken it off a longer list, so
         * we assign bio_list to the next (which is NULL) and bio_tail
         * to &bio_list, thus initialising the bio_list of new bios to be
         * added.  __generic_make_request may indeed add some more bios
         * through a recursive call to generic_make_request.  If it
         * did, we find a non-NULL value in bio_list and re-enter the loop
         * from the top.  In this case we really did just take the bio
         * of the top of the list (no pretending) and so fixup bio_list and
         * bio_tail or bi_next, and call into __generic_make_request again.
         *
         * The loop was structured like this to make only one call to
         * __generic_make_request (which is important as it is large and
         * inlined) and to keep the structure simple.
         */
        BUG_ON(bio->bi_next);

        do {
                current->bio_list = bio->bi_next;

                if (bio->bi_next == NULL)
                        current->bio_tail = &current->bio_list;
                else
                        bio->bi_next = NULL;

                __generic_make_request(bio);
                bio = current->bio_list;
        } while (bio);

        current->bio_tail = NULL; /* deactivate */
}

static inline void __generic_make_request(struct bio *bio)
{
        struct request_queue *q;
        sector_t old_sector;
        int ret, nr_sectors = bio_sectors(bio);
        dev_t old_dev;
        int err = -EIO;

        might_sleep();
        /* 1,
         * check for physical overflow
         */
        if (bio_check_eod(bio, nr_sectors))
                goto end_io;

        /*
         * Resolve the mapping until finished. (drivers are
         * still free to implement/resolve their own stacking
         * by explicitly returning 0)
         *
         * NOTE: we don't repeat the blk_size check for each new device.
         * Stacking drivers are expected to know what they are doing.
         */
        old_sector = -1;
        old_dev = 0;
        do {
                char b[BDEVNAME_SIZE];
                /* 2,
                 * determine the physical disk
                 */
                q = bdev_get_queue(bio->bi_bdev);
                if (!q) {
                        printk(KERN_ERR
                               "generic_make_request: Trying to access "
                                "nonexistent block-device %s (%Lu)\n",
                                bdevname(bio->bi_bdev, b),
                                (long long) bio->bi_sector);
end_io:
                        bio_endio(bio, err);
                        break;
                }
                /* 3,
                 * does the bio overflow the controller's capability?
                 */
                if (unlikely(nr_sectors > q->max_hw_sectors)) {
                        printk(KERN_ERR "bio too big device %s (%u > %u)\n",
                                bdevname(bio->bi_bdev, b),
                                bio_sectors(bio),
                                q->max_hw_sectors);
                        goto end_io;
                }
                /* 4,
                 * is the elevator still healthy?
                 */
                if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
                        goto end_io;
                /* 5,
                 * or is this just a fault-injection poke?
                 */
                if (should_fail_request(bio))
                        goto end_io;

                /* 6,
                 * If this device has partitions, remap block n
                 * of partition p to block n+start(p) of the disk.
                 */
                blk_partition_remap(bio);

                if (old_sector != -1)
                        blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
                                            old_sector);

                blk_add_trace_bio(q, bio, BLK_TA_QUEUE);

                old_sector = bio->bi_sector;
                old_dev = bio->bi_bdev->bd_dev;

                if (bio_check_eod(bio, nr_sectors))
                        goto end_io;
                /* 7,
                 * reject an empty barrier that the device cannot honour
                 */
                if (bio_empty_barrier(bio) && !q->prepare_flush_fn) {
                        err = -EOPNOTSUPP;
                        goto end_io;
                }
                /* 8,
                 * do it; but where is make_request_fn assigned?
                 * in void blk_queue_make_request(struct request_queue *q,
                 *                                make_request_fn *mfn),
                 * and what about __make_request?
                 * it is the default method set up when the Q is initialised
                 */
                ret = q->make_request_fn(q, bio);
        } while (ret);
}

static int __make_request(struct request_queue *q, struct bio *bio)
{
        struct request *req;
        int el_ret, nr_sectors, barrier, err;
        const unsigned short prio = bio_prio(bio);
        const int sync = bio_sync(bio);
        int rw_flags;

        nr_sectors = bio_sectors(bio);

        /*
         * low level driver can indicate that it wants pages above a
         * certain limit bounced to low memory (ie for highmem, or even
         * ISA dma in theory)
         */
        blk_queue_bounce(q, &bio);

        barrier = bio_barrier(bio);

        if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
                err = -EOPNOTSUPP;
                goto end_io;
        }

        spin_lock_irq(q->queue_lock);
        /* 1,
         * barriers are not merged, in principle
         */
        if (unlikely(barrier) || elv_queue_empty(q))
                goto get_rq;
        /* 2,
         * try to merge
         */
        el_ret = elv_merge(q, &req, bio);

        switch (el_ret) {
        case ELEVATOR_BACK_MERGE:
                BUG_ON(!rq_mergeable(req));

                if (!ll_back_merge_fn(q, req, bio))
                        break;

                blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);

                req->biotail->bi_next = bio;
                req->biotail = bio;
                req->nr_sectors = req->hard_nr_sectors += nr_sectors;
                req->ioprio = ioprio_best(req->ioprio, prio);
                drive_stat_acct(req, 0);

                if (!attempt_back_merge(q, req))
                        elv_merged_request(q, req, el_ret);
                goto out;

        case ELEVATOR_FRONT_MERGE:
                BUG_ON(!rq_mergeable(req));

                if (!ll_front_merge_fn(q, req, bio))
                        break;

                blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);

                bio->bi_next = req->bio;
                req->bio = bio;

                /*
                 * may not be valid. if the low level driver said
                 * it didn't need a bounce buffer then it better
                 * not touch req->buffer either...
                 */
                req->buffer = bio_data(bio);
                req->current_nr_sectors = bio_cur_sectors(bio);
                req->hard_cur_sectors = req->current_nr_sectors;
                req->sector = req->hard_sector = bio->bi_sector;
                req->nr_sectors = req->hard_nr_sectors += nr_sectors;
                req->ioprio = ioprio_best(req->ioprio, prio);
                drive_stat_acct(req, 0);
                if (!attempt_front_merge(q, req))
                        elv_merged_request(q, req, el_ret);
                goto out;

        /* ELV_NO_MERGE: elevator says don't/can't merge. */
        default:
                break;
        }

get_rq:
        /*
         * This sync check and mask will be re-done in init_request_from_bio(),
         * but we need to set it earlier to expose the sync flag to the
         * rq allocator and io schedulers.
         */
        rw_flags = bio_data_dir(bio);
        if (sync)
                rw_flags |= REQ_RW_SYNC;

        /* 3,
         * Grab a free request. This might sleep but can not fail.
         * Returns with the queue unlocked.
         */
        req = get_request_wait(q, rw_flags, bio);

        /*
         * After dropping the lock and possibly sleeping here, our request
         * may now be mergeable after it had proven unmergeable (above).
         *
         * We don't worry about that case for efficiency. It won't happen
         * often, and the elevators are able to handle it.
         */
        init_request_from_bio(req, bio);

        spin_lock_irq(q->queue_lock);

        if (elv_queue_empty(q))
                blk_plug_device(q);
        /* 4,
         * a new request: add it to the queue and let it wait there
         */
        add_request(q, req);
out:
        if (sync) {
                /* 5,
                 * if necessary, get the disk rotating right away,
                 * even with gas above $100 a barrel
                 */
                __generic_unplug_device(q);
        }
        spin_unlock_irq(q->queue_lock);
        return 0;
end_io:
        bio_endio(bio, err);
        return 0;
}
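Regarding note 8 above: q->make_request_fn is assigned in blk_queue_make_request();
blk_init_queue_node() (post #118) passes __make_request, so an ordinary request-based
queue takes the elevator path shown here. A stacking or bio-based driver instead installs
its own handler and bypasses the elevator entirely. A minimal, hypothetical sketch, with
invented names and no error handling:

/* A bio-based driver handles each bio itself; no request, no elevator. */
static int my_bio_make_request(struct request_queue *q, struct bio *bio)
{
        /* a real driver (brd, dm, md, ...) would service the bio here */
        bio_endio(bio, 0);      /* complete it with no error */
        return 0;               /* 0 = fully handled, do not loop in
                                 * __generic_make_request()            */
}

static struct request_queue *my_setup_queue(void)
{
        struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

        if (!q)
                return NULL;
        blk_queue_make_request(q, my_bio_make_request);
        return q;
}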