/*
* Auto-unplugging state
*/
//插入设备时所用到的定时器
struct timer_list unplug_timer;
//如果请求队列中待处理请求数大于该值,将立即去掉请求设备
int unplug_thresh; /* After this many requests */
//去掉设备之间的延迟
unsigned long unplug_delay; /* After this many jiffies */
//去掉设备时使用的操作队列
struct work_struct unplug_work;
//
struct backing_dev_info backing_dev_info;
/*
* The queue owner gets to use this for whatever they like.
* ll_rw_blk doesn't touch it.
*/
//指向块设备驱动程序中的私有数据
void *queuedata;
//activity_fn()所用的参数
void *activity_data;
/*
* queue needs bounce pages for pages above this limit
*/
//如果页框号大于该值,将使用回弹缓存冲
unsigned long bounce_pfn;
//回弹缓存区页面的分配标志
int bounce_gfp;
/*
* various queue flags, see QUEUE_* below
*/
//描述请求队列的标志
unsigned long queue_flags;
/*
* queue settings
*/
//请求队列中允许的最大请求数
unsigned long nr_requests; /* Max # of requests */
//如果待请求的数目超过了该值,则认为该队列是拥挤的
unsigned int nr_congestion_on;
//如果待请求数目在这个阀值下,则认为该队列是不拥挤的
unsigned int nr_congestion_off;
//单个请求所能处理的最大扇区(可调的)
unsigned short max_sectors;
//单个请求所能处理的最大扇区(硬约束)
unsigned short max_hw_sectors;
//单个请求所能处理的最大物理段数
unsigned short max_phys_segments;
//单个请求所能处理的最大物理段数(DMA的约束)
unsigned short max_hw_segments;
//扇区中以字节 为单位的大小
unsigned short hardsect_size;
//物理段的最大长度(以字节为单位)
unsigned int max_segment_size;
//段合并的内存边界屏弊字
unsigned long seg_boundary_mask;
//DMA缓冲区的起始地址和长度的对齐
unsigned int dma_alignment;
//空闲/忙标记的位图.用于带标记的请求
struct blk_queue_tag *queue_tags;
//请求队列的引用计数
atomic_t refcnt;
//请求队列中待处理的请求数
unsigned int in_flight;
/*
* sg stuff
*/
//用户定义的命令超时
unsigned int sg_timeout;
//Not Use
unsigned int sg_reserved_size;
}
request_queue表示的是一个请求队列,每一个请求都是用request来表示的.
3.5: request结构:
/*
 * Descriptor of a single block I/O request; requests are linked together on
 * a request queue. Field notes combine the kernel's own comments with the
 * article's annotations.
 */
struct request {
//links the request into a list
struct list_head queuelist; /* looking for ->queue? you must _not_
* access it directly, use
* blkdev_dequeue_request! */
//flags of the request descriptor
unsigned long flags; /* see REQ_ bits below */
/* Maintain bio traversal state for part by part I/O submission.
* hard_* are block layer internals, no driver should touch them!
*/
//next sector to transfer
sector_t sector; /* next sector to submit */
//number of sectors to transfer
unsigned long nr_sectors; /* no. of sectors left to submit */
/* no. of sectors left to submit in the current segment */
//number of sectors to transfer in the current bio segment
unsigned int current_nr_sectors;
//next sector to complete
sector_t hard_sector; /* next sector to complete */
//number of sectors left to complete for the whole request
unsigned long hard_nr_sectors; /* no. of sectors left to complete */
/* no. of sectors left to complete in the current segment */
//number of sectors left to complete in the current bio segment
unsigned int hard_cur_sectors;
/* no. of segments left to submit in the current bio */
unsigned short nr_cbio_segments;
/* no. of sectors left to submit in the current bio */
unsigned long nr_cbio_sectors;
struct bio *cbio; /* next bio to submit */
//first bio of the request that has not completed yet
struct bio *bio; /* next unfinished bio to complete */
//last bio of the request
struct bio *biotail;
//private data area of the I/O scheduler
void *elevator_private;
//status of the request
int rq_status; /* should split this into a few status bits */
//disk descriptor the request refers to
struct gendisk *rq_disk;
//counter of failed transfers
int errors;
//time at which the request started
unsigned long start_time;
/* Number of scatter-gather DMA addr+len pairs after
* physical address coalescing is performed.
*/
//number of physical segments of the request
unsigned short nr_phys_segments;
/* Number of scatter-gather addr+len pairs after
* physical and DMA remapping hardware coalescing is performed.
* This is the number of scatter-gather entries the driver
* will actually have to deal with after DMA mapping is done.
*/
//number of hardware segments of the request
unsigned short nr_hw_segments;
//tag associated with the request
int tag;
//buffer for the data transfer; per the article this is NULL when the data
//lives in high memory
char *buffer;
//reference count of the request
int ref_count;
//request queue descriptor that contains this request
request_queue_t *q;
struct request_list *rl;
//completion used to wait for the end of the data transfer
struct completion *waiting;
//pointer used when a "special" request is sent to the device
void *special;
/*
* when request is used as a packet command carrier
*/
//length of the data in cmd
unsigned int cmd_len;
//packet command buffer, BLK_MAX_CDB bytes
unsigned char cmd[BLK_MAX_CDB];
//length of the data in data
unsigned int data_len;
//pointer used to keep track of the transferred data
void *data;
//length of the data in the sense buffer
unsigned int sense_len;
//output sense buffer
void *sense;
//request timeout
unsigned int timeout;
/*
* For Power Management requests
*/
//structure used by power management commands
struct request_pm_state *pm;
}
请求队列描述符与请求描述符都很复杂,为了简化驱动的设计,内核提供了一个API,供块设备驱动程序来初始化一个请求队列.这就是blk_init_queue().它的代码如下:
/*
 * Allocate a request queue descriptor and set it up for a block driver.
 *
 * rfn:  driver-supplied function that performs the I/O; according to the
 *       article it corresponds to the queue's request_fn
 * lock: spinlock the driver supplies for the request queue
 *
 * Returns the new queue, or NULL when allocation or initialisation fails.
 *
 * NOTE(review): in this listing rfn, lock and printed are never used, so
 * the excerpt appears to be abridged - compare with the kernel source.
 */
request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
request_queue_t *q;
static int printed;
//allocate the request queue descriptor
q = blk_alloc_queue(GFP_KERNEL);
if (!q)
return NULL;
//initialise q->request_list (per the article); non-zero means failure
if (blk_init_free_list(q))
goto out_init;
/*
* all done
*/
//attach the I/O scheduler to the queue; a zero return is success
if (!elevator_init(q, chosen_elevator))
return q;
//failure handling
blk_cleanup_queue(q);
out_init:
kmem_cache_free(requestq_cachep, q);
return NULL;
}
这个函数中初始化了很多操作指针,这个函数在所有块设备中都是一样的,这样就为通用块设备层提供了一个统一的接口.对于块设备驱动的接口就是我们在blk_init_queue中设置的策略例程了.留意一下关于请求队列的各操作的设置,这在后续的分析中会用到.
另外,在请求结构中涉及到了bio结构.bio表示一个段.目前内核中关于I/O的所有操作都是由它来表示的.它的结构如下所示:
/*
 * Descriptor of one block I/O operation; the article describes it as
 * representing a segment and notes that requests are built out of bios.
 */
struct bio {
//starting sector of the segment
sector_t bi_sector;
//next bio
struct bio *bi_next; /* request queue link */
//block device the segment belongs to
struct block_device *bi_bdev;
//flags of the bio
unsigned long bi_flags; /* status, command, etc */
//Read/Write
unsigned long bi_rw; /* bottom bits READ/WRITE,
* top bits priority
*/
//number of bio_vec entries
unsigned short bi_vcnt; /* how many bio_vec's */
//index of the bio_vec currently being processed
unsigned short bi_idx; /* current index into bvl_vec */
/* Number of segments in this BIO after
* physical address coalescing is performed.
*/
//number of segments after coalescing
unsigned short bi_phys_segments;
/* Number of segments after physical and DMA remapping
* hardware coalescing is performed.
*/
//number of segments after remapping
unsigned short bi_hw_segments;
//I/O count
unsigned int bi_size; /* residual I/O count */
/*
* To keep track of the max hw size, we account for the
* sizes of the first and last virtually mergeable segments
* in this bio
*/
//size of the first mergeable segment
unsigned int bi_hw_front_size;
//size of the last mergeable segment
unsigned int bi_hw_back_size;
//maximum number of bio_vec entries
unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
//the bio_vec array
struct bio_vec *bi_io_vec; /* the actual vec list */
//method called when the I/O completes
bio_end_io_t *bi_end_io;
//usage count
atomic_t bi_cnt; /* pin count */
//private area of the owner
void *bi_private;
//method that destroys this bio
bio_destructor_t *bi_destructor; /* destructor */
}
bio_vec的结构如下:
/*
 * One entry of a bio's bi_io_vec array: a range of bytes inside a page.
 */
struct bio_vec {
//page this entry refers to
struct page *bv_page;
//length of the data area
unsigned int bv_len;
//offset of the data within the page
unsigned int bv_offset;
}
关于bio与bio_vec的关系,用下图表示:
现在,我们来思考一个问题:
当一个I/O请求提交给请求队列后,它是怎么去调用块设备驱动的策略例程去完成这次I/O的呢?还有,当一个I/O请求被提交给请求队列时,会不会立即调用驱动中的策略例程去完成这次I/O呢?
实际上,为了提高效率,所有的I/O都会在一个特定的延时之后才会调用策略例程去完成本次I/O.我们来看一个反面的例子,假设I/O在被提交后马上得到执行.例如,磁盘的磁头当前位于磁道12.现在有一个磁道1的请求,就会将磁头移动到磁道1.操作完后,又有一个请求过来了,它要操作磁道11,于是又会将磁头移到磁道11.操作完后,又有一个请求过来,要求操作磁道4,此时会将磁头移到磁道4.这个例子中,磁头移动的位置是:12->1->11->4.实际上,磁头的定位是一个很耗时的操作.这样下去,毫无疑问会影响整个系统的效率.我们可以在这段延时内,将所有I/O操作按顺序排列在一起,然后再调用策略例程.于是上例的磁头移动就会变成12->11->4->1.此时磁头只会往一个方向移动.
至于怎么样排列请求和选取哪一个请求进行操作,这就是I/O调度的任务了.这部分我们在通用块层再进行分析.
内核中有两个操作会完成上面的延时过程.即:激活块设备驱动程序和撤消块设备驱动程序.
3.6:块设备驱动程序的激活和撤消
激活块设备驱动程序和撤消块设备驱动程序在内核中对应的接口为blk_plug_device()和blk_remove_plug().分别看下它们的操作:
/*
 * Mark request queue q as plugged and arm its unplug timer.
 *
 * Warns when called with interrupts enabled. A stopped queue is never
 * plugged; a queue that is already plugged keeps the timer it has.
 */
void blk_plug_device(request_queue_t *q)
{
	WARN_ON(!irqs_disabled());

	/*
	 * don't plug a stopped queue, it must be paired with blk_start_queue()
	 * which will restart the queueing
	 */
	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
		return;

	/* QUEUE_FLAG_PLUGGED was already set: leave the running timer alone */
	if (test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
		return;

	/* first plug: let the unplug timer expire unplug_delay jiffies from now */
	mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
}
/*
 * NOTE(review): this listing is corrupted. The blk_remove_plug() header is
 * followed by statements that use __rq, bio, req and ret, none of which is
 * declared here, and by break/continue without an enclosing loop; the braces
 * do not balance either. The rest of blk_remove_plug() and the beginning of
 * a request-merging loop seem to have been lost - restore both from the
 * kernel source before relying on this snippet.
 */
int blk_remove_plug(request_queue_t *q)
{
WARN_ON(!irqs_disabled());
if (__rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER))
break;
else if (__rq->flags & REQ_STARTED)
break;
//skip requests that are not fs-type requests
if (!blk_fs_request(__rq))
continue;
//check whether the bio can be merged with this request
if ((ret = elv_try_merge(__rq, bio))) {
*req = __rq;
q->last_merge = __rq;
return ret;
}
}
return ELEVATOR_NO_MERGE;
}
elv_try_merge()用来判断能否与请求合并,它的代码如下:
/*
 * Decide whether bio can be merged into request __rq, and on which side.
 *
 * Returns ELEVATOR_BACK_MERGE when bio starts exactly where the request's
 * sector range ends, ELEVATOR_FRONT_MERGE when bio ends exactly where the
 * request starts, and ELEVATOR_NO_MERGE otherwise (including when
 * elv_rq_merge_ok() rejects the pair).
 */
inline int elv_try_merge(struct request *__rq, struct bio *bio)
{
	/* the request and the bio must be compatible before positions matter */
	if (!elv_rq_merge_ok(__rq, bio))
		return ELEVATOR_NO_MERGE;

	/* request start + request length == bio start: append bio behind __rq */
	if (__rq->sector + __rq->nr_sectors == bio->bi_sector)
		return ELEVATOR_BACK_MERGE;

	/* request start - bio length == bio start: put bio in front of __rq */
	if (__rq->sector - bio_sectors(bio) == bio->bi_sector)
		return ELEVATOR_FRONT_MERGE;

	return ELEVATOR_NO_MERGE;
}
elv_rq_merge_ok()代码如下:
/*
 * Check whether bio is allowed to be merged into request rq.
 *
 * Returns 0 when rq is not mergeable or when the data directions differ,
 * and 1 when both target the same disk and rq has neither a waiter nor
 * special data attached.
 *
 * NOTE(review): the statements after "return 1;" use q, which is neither a
 * parameter nor a local of this function, and the function has no return on
 * that path. They look like the tail of a different function that was
 * spliced in when the listing was extracted - verify against the kernel
 * source.
 */
inline int elv_rq_merge_ok(struct request *rq, struct bio *bio)
{
//is rq eligible for merging at all?
if (!rq_mergeable(rq))
return 0;
/*
* different data direction or already started, don't merge
*/
//do both perform the same operation (same data direction)?
if (bio_data_dir(bio) != rq_data_dir(rq))
return 0;
/*
* same device and no special stuff set, merge is ok
*/
//do both operate on the same disk?
if (rq->rq_disk == bio->bi_bdev->bd_disk &&
!rq->waiting && !rq->special)
return 1;
/*
* new merges must not precede this barrier
*/
if (rq->flags & REQ_HARDBARRIER)
q->last_merge = NULL;
else if (!q->last_merge)
q->last_merge = rq;
}
五:通用块层的处理
通用块层的入口点为generic_make_request().它的代码如下:
void generic_make_request(struct bio *bio)
{
request_queue_t *q;
sector_t maxsector;
//nr_sectors:要操作的扇区数
int ret, nr_sectors = bio_sectors(bio);
//可能会引起睡眠
might_sleep();
/* Test device or partition size, when known. */
//最大扇区数目
maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
if (maxsector) {
//bio操作的起始扇区
sector_t sector = bio->bi_sector;
//如果最大扇区数
//非法的情况
if (maxsector
maxsector - nr_sectors
char b[BDEVNAME_SIZE];
/* This may well happen - the kernel calls
* bread() without checking the size of the
* device, e.g., when mounting a device. */
printk(KERN_INFO
"attempt to access beyond end of device\n");
printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
bdevname(bio->bi_bdev, b),
bio->bi_rw,
(unsigned long long) sector + nr_sectors,
(long long) maxsector);
/*
* Resolve the mapping until finished. (drivers are
* still free to implement/resolve their own stacking
* by explicitly returning 0)
*
* NOTE: we don't repeat the blk_size check for each new device.
* Stacking drivers are expected to know what they are doing.
*/
do {
char b[BDEVNAME_SIZE];
//取得块设备的请求对列
q = bdev_get_queue(bio->bi_bdev);
if (!q) {
//请求队列不存在
printk(KERN_ERR
"generic_make_request: Trying to access "
"nonexistent block-device %s (%Lu)\n",
bdevname(bio->bi_bdev, b),
(long long) bio->bi_sector);
end_io:
//最终会调用bio->bi_end_io
bio_endio(bio, bio->bi_size, -EIO);
break;
}
//非法的情况
if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
printk("bio too big device %s (%u > %u)\n",
bdevname(bio->bi_bdev, b),
bio_sectors(bio),
q->max_hw_sectors);
goto end_io;
}
//如果请求队列为QUEUE_FLAG_DEAD
//退出
if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))
goto end_io;
/*
* If this device has partitions, remap block n
* of partition p to block n+start(p) of the disk.
*/
//如果当前块设备是一个分区,则转到分区所属的块设备
blk_partition_remap(bio);
//调用请求队列的make_request_fn()
ret = q->make_request_fn(q, bio);
} while (ret);
}
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
* ISA dma in theory)
*/
//建立一个弹性回环缓存
blk_queue_bounce(q, &bio);
/*
* may not be valid. if the low level driver said
* it didn't need a bounce buffer then it better
* not touch req->buffer either...
*/
req->buffer = bio_data(bio);
req->current_nr_sectors = cur_nr_sectors;
req->hard_cur_sectors = cur_nr_sectors;
req->sector = req->hard_sector = sector;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
drive_stat_acct(req, nr_sectors, 0);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req);
goto out;
/*
* elevator says don't/can't merge. get new request
*/
//不可以合并.申请一个新的请求,将且加入请求队列
case ELEVATOR_NO_MERGE:
break;
default:
printk("elevator returned crap (%d)\n", el_ret);
BUG();
}
/*
* Grab a free request from the freelist - if that is empty, check
* if we are doing read ahead and abort instead of blocking for
* a free slot.
*/
get_rq:
//freereq:是新分配的请求描述符
if (freereq) {
req = freereq;
freereq = NULL;
} else {
//分配一个请求描述符
spin_unlock_irq(q->queue_lock);
if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
/*
* READA bit set
*/
//分配失败
err = -EWOULDBLOCK;
if (bio_rw_ahead(bio))
goto end_io;
if (rl->count[rw]+1 >= q->nr_requests) {
/*
* The queue will fill after this allocation, so set it as
* full, and mark this process as "batching". This process
* will be allowed to complete a batch of requests, others
* will be blocked.
*/
//判断是否将队列置为了QUEUE_FLAG_READFULL/QUEUE_FLAG_WRITEFULL
//如果没有,则置此标志.并且设置当前进程为batching
if (!blk_queue_full(q, rw)) {
ioc_set_batching(ioc);
blk_set_queue_full(q, rw);
}
}
//如果队列满了,进程不为batching 且I/O调度程序不能忽略它
//不能分配.直接返回
if (blk_queue_full(q, rw)
&& !ioc_batching(ioc) && !elv_may_queue(q, rw)) {
/*
* The queue is full and the allocating process is not a
* "batcher", and not exempted by the IO scheduler
*/
spin_unlock_irq(q->queue_lock);
goto out;
}
//分配请求描述符
rq = blk_alloc_request(q, gfp_mask);
if (!rq) {
/*
* Allocation failed presumably due to memory. Undo anything
* we might have messed up.
*
* Allocating task should really be put onto the front of the
* wait queue, but this is pretty rare.
*/
spin_lock_irq(q->queue_lock);
//分配失败了,要减小分配描述的引用计数
freed_request(q, rw);
spin_unlock_irq(q->queue_lock);
goto out;
}
if (ioc_batching(ioc))
ioc->nr_batch_requests--;
//初始化请求的各字段
INIT_LIST_HEAD(&rq->queuelist);
/*
* first three bits are identical in rq->flags and bio->bi_rw,
* see bio.h and blkdev.h
*/
rq->flags = rw;
/*
* After sleeping, we become a "batching" process and
* will be able to allocate at least one request, and
* up to a big batch of them for a small period time.
* See ioc_batching, ioc_set_batching
*/
//这里是被唤醒之后运行
ioc_set_batching(ioc);
}
//将进程从等待队列中删除
finish_wait(&rl->wait[rw], &wait);
} while (!rq);
put_io_context(ioc);