- 论坛徽章:
- 0
|
块设备驱动无非就是提供一个接口供vfs调用实现磁盘读写等功能。
首先每个块设备驱动都要创建一个gendisk结构,保存了大部分信息。很明显的有主,次设备号,磁盘分区信息,和一个请求队列request_queue等。
/*
 * In-kernel representation of one whole disk: device numbers, name,
 * partition table, driver operations and the request queue. The driver
 * allocates it with alloc_disk() and registers it with add_disk()
 * (see the surrounding article text).
 */
struct gendisk {
/* major, first_minor and minors are input parameters only,
 * don't use directly. Use disk_devt() and disk_max_parts().
 */
int major; /* major number of driver */
int first_minor;
int minors; /* maximum number of minors, =1 for
* disks that can't be partitioned. */
char disk_name[DISK_NAME_LEN]; /* name of major driver */
char *(*devnode)(struct gendisk *gd, umode_t *mode);
unsigned int events; /* supported events */
unsigned int async_events; /* async events, subset of all */
/* Array of pointers to partitions indexed by partno.
* Protected with matching bdev lock but stat and other
* non-critical accesses use RCU. Always access through
* helpers.
*/
struct disk_part_tbl __rcu *part_tbl;
struct hd_struct part0; /* partition entry for the whole disk itself */
const struct block_device_operations *fops; /* driver callbacks; fops->open is invoked from __blkdev_get() */
struct request_queue *queue; /* request queue servicing I/O for this disk */
void *private_data; /* driver-private data */
int flags;
struct device *driverfs_dev; // FIXME: remove
struct kobject *slave_dir;
struct timer_rand_state *random;
atomic_t sync_io; /* RAID */
struct disk_events *ev;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *integrity;
#endif
int node_id;
};
hd_struct描述了一个磁盘分区的信息,包括开始的sector nr和sector count。
/*
 * Describes a single disk partition: where it starts on the disk
 * (start_sect), how many sectors it spans (nr_sects), plus per-partition
 * accounting. Looked up by partition number via disk_get_part().
 */
struct hd_struct {
sector_t start_sect; /* first sector of the partition */
sector_t nr_sects; /* partition length in sectors */
seqcount_t nr_sects_seq; /* protects consistent reads of nr_sects */
sector_t alignment_offset;
unsigned int discard_alignment;
struct device __dev; /* embedded device for sysfs representation */
struct kobject *holder_dir;
int policy, partno; /* partno: partition index within the disk */
struct partition_meta_info *info;
unsigned long stamp;
atomic_t in_flight[2]; /* in-flight I/O counters (read/write) */
struct disk_stats __percpu *dkstats;
atomic_t ref; /* reference count (disk_get_part/disk_put_part) */
struct rcu_head rcu_head;
};
request_queue核心管理了一个请求队列,并且提供request_fn方法处理vfs的读写请求,明显request_queue是磁盘驱动联系vfs的一个纽带。磁盘驱动一般alloc_disk分配gendisk,然后调用add_disk注册它。通过blk_init_queue分配一个request_queue和request_fn方法。
但是vfs并没有直接使用request_queue,而是对每个分区或者磁盘都用了一个block_device结构。block_device结构中保存了gendisk,request_queue以及分区信息。block_device的结构看起来都是很友善的,但是和gendisk,request_queue由驱动负责创建不同,block_device结构的出生稍微有点曲折。
/*
 * Per-device object the VFS actually works with -- one per whole disk
 * and one per partition. It is embedded in a bdev_inode on the internal
 * blockdev pseudo-filesystem and looked up by device number via bdget().
 * Unlike gendisk/request_queue (created by the driver), this is created
 * lazily on first open (see bd_acquire()/bdget() below).
 */
struct block_device {
dev_t bd_dev; /* not a kdev_t - it's a search key */
int bd_openers; /* number of times this bdev has been opened */
struct inode * bd_inode; /* will die */
struct super_block * bd_super;
struct mutex bd_mutex; /* open/close mutex */
struct list_head bd_inodes;
void * bd_claiming;
void * bd_holder;
int bd_holders;
bool bd_write_holder;
#ifdef CONFIG_SYSFS
struct list_head bd_holder_disks;
#endif
struct block_device * bd_contains; /* whole-disk bdev for a partition; points to self for a whole disk */
unsigned bd_block_size;
struct hd_struct * bd_part; /* partition descriptor, set in __blkdev_get() */
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
int bd_invalidated; /* nonzero => partition table must be rescanned on next open */
struct gendisk * bd_disk; /* owning gendisk, bound on first open */
struct request_queue * bd_queue; /* shortcut to bd_disk->queue */
struct list_head bd_list;
/*
* Private data. You must have bd_claim'ed the block_device
* to use this. NOTE: bd_claim allows an owner to claim
* the same device multiple times, the owner must take special
* care to not mess up bd_private for that case.
*/
unsigned long bd_private;
/* The counter of freeze processes */
int bd_fsfreeze_count;
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
};
比如磁盘sda,看一下磁盘分区/dev/sda1的block_device都是怎么来的吧。不同于/dev/sda整个磁盘的block_device在add_disk中就产生了,并且还rescan_partitions从而创建了各个分区。我们照着最常见的情况分析,比如linux init进程打开/dev/sda1来挂载文件系统的情况。
直接进入blkdev_open, 我们现在inode指向的是/dev/sda1, 其实这个inode除了设备号是有意义的,其他属性都是一个空壳。然后通过bd_acquire找到或者创建了对应于这个设备号的block_device,当然之后还要通过blkdev_get将这个block_device赋给现在的空壳inode并进行一些必要的初始化工作。这样即使我们多次打开/dev/sda1,它们都指向sda的第一个分区,虽然inode实体并不相同,但是通过blkdev_open之后它们关联到的block_device都是唯一的。
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
/*
 * Open entry point for block device special files (def_blk_fops.open).
 * Translates open(2) flags into fmode bits, resolves the device inode
 * to its block_device via bd_acquire(), then delegates the real open
 * work to blkdev_get().
 */
static int blkdev_open(struct inode * inode, struct file * filp)
{
	struct block_device *bd;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binary needs it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;

	/* Mirror the relevant open flags into the file mode. */
	if (filp->f_flags & O_NDELAY)
		filp->f_mode |= FMODE_NDELAY;
	if (filp->f_flags & O_EXCL)
		filp->f_mode |= FMODE_EXCL;
	if ((filp->f_flags & O_ACCMODE) == 3)
		filp->f_mode |= FMODE_WRITE_IOCTL;

	/* Find (or create) the block_device for this device number. */
	bd = bd_acquire(inode);
	if (!bd)
		return -ENOMEM;

	/* Share the page cache of the backing bdev inode. */
	filp->f_mapping = bd->bd_inode->i_mapping;

	return blkdev_get(bd, filp->f_mode, filp);
}
bd_acquire将inode的设备号传给bdget方法,就得到了相应的block_device。
/*
 * Map a device-special inode to its block_device.
 *
 * Fast path: the inode already caches a bdev in inode->i_bdev -- take an
 * extra reference on the backing bdev inode and return it. Slow path:
 * look up / create the block_device by device number via bdget(), then
 * (re-checking under bdev_lock, since we dropped it) cache it on the
 * inode. Returns NULL if bdget() fails.
 */
static struct block_device *bd_acquire(struct inode *inode)
{
struct block_device *bdev;
spin_lock(&bdev_lock);
bdev = inode->i_bdev;
if (bdev) {
/* Cached: pin the bdev inode so i_mapping stays valid. */
ihold(bdev->bd_inode);
spin_unlock(&bdev_lock);
return bdev;
}
spin_unlock(&bdev_lock);
/* Not cached: find or create the bdev keyed by device number. */
bdev = bdget(inode->i_rdev);
if (bdev) {
spin_lock(&bdev_lock);
/* Another opener may have raced us while the lock was dropped. */
if (!inode->i_bdev) {
/*
* We take an additional reference to bd_inode,
* and it's released in clear_inode() of inode.
* So, we can access it via ->i_mapping always
* without igrab().
*/
ihold(bdev->bd_inode);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
list_add(&inode->i_devices, &bdev->bd_inodes);
}
spin_unlock(&bdev_lock);
}
return bdev;
}
bdget方法实际上以设备号为key,到inode_hashtable中查找一个bdev_inode的inode,如果没有找到,则创建一个bdev_inode的inode。可以看到bdev_inode中的bdev属性就是我们的block_device实体,各种block_device指针最终都是指向的这个bdev。为什么是bdev_inode结构,实际上blockdev_superblock指向的是bd_type(file_system_type),相关的alloc_inode方法分配的正是这个bdev_inode结构。block_device创建之后需要一些基本的初始化,但是我们看到这个时候block_device和真正的gendisk,request_queue并没有做关联,具体的工作会丢给blkdev_get处理的。
/*
 * Find or create the block_device for device number 'dev'.
 *
 * Uses iget5_locked() on the internal blockdev pseudo-filesystem
 * (blockdev_superblock) with the device number as the hash key; the
 * superblock's alloc_inode allocates a bdev_inode, so the bdev lives
 * embedded next to its VFS inode. If the inode is freshly created
 * (I_NEW), initialize the bdev and inode fields here. Note the bdev is
 * NOT yet connected to its gendisk/request_queue -- that happens later
 * in __blkdev_get(). Returns NULL on allocation failure.
 */
struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
struct inode *inode;
inode = iget5_locked(blockdev_superblock, hash(dev),
bdev_test, bdev_set, &dev);
if (!inode)
return NULL;
/* The bdev is embedded in the containing bdev_inode. */
bdev = &BDEV_I(inode)->bdev;
if (inode->i_state & I_NEW) {
/* Freshly created: perform one-time initialization. */
bdev->bd_contains = NULL;
bdev->bd_super = NULL;
bdev->bd_inode = inode;
bdev->bd_block_size = (1 << inode->i_blkbits);
bdev->bd_part_count = 0;
bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
inode->i_data.a_ops = &def_blk_aops;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
inode->i_data.backing_dev_info = &default_backing_dev_info;
spin_lock(&bdev_lock);
list_add(&bdev->bd_list, &all_bdevs);
spin_unlock(&bdev_lock);
/* Publish the inode; clears I_NEW and wakes waiters. */
unlock_new_inode(inode);
}
return bdev;
}
/*
 * Container pairing a block_device with the VFS inode that backs it on
 * the internal blockdev pseudo-filesystem. bdget() returns &bdev after
 * locating the inode; BDEV_I() converts inode -> bdev_inode.
 */
struct bdev_inode {
struct block_device bdev;
struct inode vfs_inode;
};
blkdev_get实际上就是调用了__blkdev_get。
get_gendisk根据设备号获取到相应的gendisk结构,并且设置bd_queue等于disk->queue。
disk_get_part根据设备号找到相应的hd_struct结构。
如果block_device已经打开过,则增加openers引用计数,调用关联的open方法。反之分两种情况:
1. partno是分区,我们现在的假设,这种情况比较简单。因为这个时候主磁盘的block_device已经创建好了,就是上面说的add_disk中做的。这个时候仅需要通过bdget_disk拿到这个block_device,然后设置到分区的block_device->bd_contains。调用关联open方法。
2. partno是0表示整个磁盘。当然其实这个是在add_disk中触发的,设置block_device->bd_contains等于自己,如果bd_invalidated=1,则重新扫描分区(add_disk的时候bd_invalidated就是1,分区就是这样建立的)。然后也要调用关联的open方法。
具体的代码稍微有点绕,忽略不重要的细节,最终的功能基本如上。
/*
 * Simplified excerpt of blkdev_get(): forwards to __blkdev_get() with
 * for_part == 0 (a normal, non-nested open). The upstream function also
 * handles exclusive holders via 'holder' -- omitted here, as the article
 * notes unimportant details are skipped.
 */
int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
	int res;	/* was used undeclared in the original excerpt */

	res = __blkdev_get(bdev, mode, 0);
	return res;
}
/*
 * Core open routine shared by whole-disk and partition opens.
 *
 * @bdev:     the block_device being opened
 * @mode:     FMODE_* flags from the opener
 * @for_part: nonzero when called recursively to open the whole-disk
 *            bdev that contains a partition (affects lock nesting and
 *            bd_part_count)
 *
 * First open (bd_openers == 0): bind the bdev to its gendisk and
 * request_queue, look up its hd_struct, call the driver's fops->open,
 * and -- for a partition -- recursively open the containing whole-disk
 * bdev first. Subsequent opens just call fops->open (whole disk only)
 * and bump the reference counts. Returns 0 or a negative errno.
 *
 * NOTE(review): quoted from a kernel tree of roughly the 3.x era; the
 * excerpt may omit lines present upstream. 'perm' is declared but never
 * used in what is shown here.
 */
static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
struct gendisk *disk;
struct module *owner;
int ret;
int partno;
int perm = 0;
restart: /* re-entry point after losing a race with disk deletion */
ret = -ENXIO;
/* Resolve device number -> gendisk; partno is 0 for the whole disk. */
disk = get_gendisk(bdev->bd_dev, &partno);
if (!disk)
goto out;
owner = disk->fops->owner;
/* Suppress media-event polling while the open is in progress. */
disk_block_events(disk);
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
/* First opener: wire the bdev to its disk and queue. */
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
if (!partno) {
/* Whole-disk open (partno == 0). */
struct backing_dev_info *bdi;
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
if (!bdev->bd_part)
goto out_clear;
ret = 0;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
if (ret == -ERESTARTSYS) {
/* Lost a race with 'disk' being
* deleted, try again.
* See md.c
*/
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
bdev->bd_disk = NULL;
bdev->bd_queue = NULL;
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
put_disk(disk);
module_put(owner);
goto restart;
}
}
if (!ret && !bdev->bd_openers) {
/* Size the bdev from disk capacity (sectors << 9 = bytes). */
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
bdi = &default_backing_dev_info;
bdev_inode_switch_bdi(bdev->bd_inode, bdi);
}
/*
* If the device is invalidated, rescan partition
* if open succeeded or failed with -ENOMEDIUM.
* The latter is necessary to prevent ghost
* partitions on a removed medium.
*/
if (bdev->bd_invalidated) {
if (!ret)
rescan_partitions(disk, bdev);
else if (ret == -ENOMEDIUM)
invalidate_partitions(disk, bdev);
}
if (ret)
goto out_clear;
} else {
/* Partition open: first open the containing whole disk. */
struct block_device *whole;
whole = bdget_disk(disk, 0);
ret = -ENOMEM;
if (!whole)
goto out_clear;
BUG_ON(for_part);
/* Recursive whole-disk open with for_part == 1. */
ret = __blkdev_get(whole, mode, 1);
if (ret)
goto out_clear;
bdev->bd_contains = whole;
bdev_inode_switch_bdi(bdev->bd_inode,
whole->bd_inode->i_data.backing_dev_info);
bdev->bd_part = disk_get_part(disk, partno);
/* Reject if the disk is gone or the partition is empty/absent. */
if (!(disk->flags & GENHD_FL_UP) ||
!bdev->bd_part || !bdev->bd_part->nr_sects) {
ret = -ENXIO;
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
}
} else {
/* Already open: only a whole disk re-runs fops->open / rescan. */
if (bdev->bd_contains == bdev) {
ret = 0;
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
if (bdev->bd_invalidated) {
if (!ret)
rescan_partitions(bdev->bd_disk, bdev);
else if (ret == -ENOMEDIUM)
invalidate_partitions(bdev->bd_disk, bdev);
}
if (ret)
goto out_unlock_bdev;
}
/* only one opener holds refs to the module and disk */
put_disk(disk);
module_put(owner);
}
bdev->bd_openers++;
if (for_part)
bdev->bd_part_count++;
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
return 0;
out_clear:
/* First-open failure: undo the disk/queue/part binding. */
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
bdev->bd_queue = NULL;
bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
put_disk(disk);
module_put(owner);
out:
bdput(bdev);
return ret;
}
|
|