- 论坛徽章:
- 0
|
本帖最后由 木叉叉木大 于 2011-07-04 14:33 编辑
不是高手,代码很搓,第一次发出自己的内容来,希望大家指教 代码部分的排版比较乱,直接从博客上copy过来的,不会用wordpress啊
目标:基于linux平台,在内核态解析vmdk文件,把文件模拟为块设备,最后可以mount (主要是做一个简单的sparse文件读写,所以很多vmdk格式的内容没有考虑,后面会随着文档列出)
环境:Ubuntu8.04 linux-2.6.24-24 gcc-4.2.4
vmdk文件格式分析:
注:这里只描述Hosted Sparse Extents(所谓的稀疏文件,一开始创建时很小,随着数据的增多而变大)
在我们使用vmware虚拟机创建一个虚拟磁盘时,会提示你选择使用单个文件(single file)还是拆分成多个文件(split file);这里讨论的是sparse文件,而不是flat文件。下面是一个sparse文件的格式:
vmdk spec 1.1 给出的
Hosted Sparse Extent Header (对应上图的Sparse header)
The following example shows the content of a sparse extent’s header from a VMware hosted
product, such as VMware Workstation, VMware Player, VMware ACE, VMware Server, or VMware
GSX Server:
This structure (struct SparseExtentHeader) needs to be packed. If you use gcc to compile your application, you must use the
keyword __attribute__((__packed__)).
typedef uint64 SectorType;
typedef uint8 Bool;
/*
 * Sparse extent header as used by this post's code.
 * NOTE(review): the text above says this structure must be packed, but no
 * packed attribute is written here.
 */
struct vmdisk_header {
uint32_t version;
uint32_t flags;
int64_t capacity;// size of this extent, expressed in sectors
int64_t granularity;// size of one grain, expressed in sectors
int64_t desc_offset;// offset of the descriptor, in sectors
int64_t desc_size; // size of the descriptor, in sectors
int32_t num_gtes_per_gte;// number of entries in a grain table (512 per the spec), i.e. how many GTEs each GDE covers
int64_t rgd_offset;// location of the redundant grain directory
int64_t gd_offset;// location of the grain directory
int64_t grain_offset;
char filler[1];
char check_bytes[4];
};
通过下面这个图可以找到需要的数据,其中数据都是以扇区为偏移的,在规范的11页有详细的计算方法
代码主要是根据读取文件来填充header结构体,然后根据header结构体,和上面这张图,来读取相应的位置的数据,如果是写数据,就计算好位置,写进去。
-----------------------------------------------------------------------------------------
代码部分:
vmdk.h //头文件
- #define SECTOR_BITS 9
- #define SECTOR_SIZE (1 << SECTOR_BITS) /* bytes per sector */
- #define SECTOR_MASK (SECTOR_SIZE - 1) /* mask selecting the byte offset inside a sector */
- #define L1_BITS (SECTOR_BITS - 3)
- #define L1_SIZE (1 << L1_BITS) /* used by open_disk() as the directory size for the "COWD" format */
- #define L1_MASK (L1_SIZE - 1)
- #define L2_BITS SECTOR_BITS
- #define L2_SIZE (1 << L2_BITS) /* number of entries in the l2dir[] grain-table buffer */
- #define L2_MASK (L2_SIZE - 1)
- #define MIN(x,y) (((x) < (y)) ? (x) : (y)) /* each argument is evaluated twice: do not pass expressions with side effects */
- /*
-  * On-disk header of a "KDMV" sparse extent, without the 4-byte magic that
-  * open_disk() reads separately before filling this structure.
-  *
-  * The structure is packed: rgd_offset is a 64-bit field that directly
-  * follows a 32-bit one, so a compiler that aligns uint64_t to 8 bytes would
-  * otherwise insert padding and every later field would be taken from the
-  * wrong file offset.
-  */
- struct vmdisk_header
- {
-     uint32_t version;
-     uint32_t flags;
-     uint64_t capacity;          /* extent size, in sectors (open_disk() shifts it by SECTOR_BITS) */
-     uint64_t granularity;       /* grain size, in sectors */
-     uint64_t desc_offset;
-     uint64_t desc_size;
-     uint32_t num_gtes_per_gte;  /* entries per grain table; becomes vdsk.grain_table_size */
-     uint64_t rgd_offset;        /* sector passed to read_l1dir() by open_disk() */
-     uint64_t gd_offset;
-     uint64_t grain_offset;
-     char filler[1];
-     char check_bytes[4];
- } __attribute__((packed));
vmdk.c //主文件
- #include <linux/module.h>
- #include <linux/init.h>
- #include <linux/moduleparam.h>
- #include <linux/blkdev.h>
- #include <linux/fs.h>
- #include <linux/bio.h>
- #include <linux/kthread.h>
- #include <linux/spinlock.h>
- #include <linux/types.h>
- #include <asm/div64.h>
- #include <asm/stat.h>
- #include "vmdk.h"
- #define KERNEL_SECTOR_SIZE 512
- #define _LARGEFILE_SOURCE
- #define _LARGEFILE64_SOURCE
- #define _FILE_OFFSET_BITS 64
- static char *name = NULL; /* module parameter: path handed to filp_open() in vmdk_init(); loading fails when it is not set */
- //static char name_vmdk[30] = "/home/pengpeng/a.vmdk";
- struct bio_list { /* FIFO of bios linked through bio->bi_next */
- struct bio *head; /* oldest queued bio, NULL when the list is empty */
- struct bio *tail; /* most recently queued bio */
- };// author's note: this mirrors a kernel structure that is not exported, so it is declared here directly
- struct vmdk_dev{ /* per-device state allocated by vmdk_alloc() */
- struct request_queue *vmdk_queue; /* queue whose make_request hook is vmdk_make_request() */
- struct gendisk *vmdk_disk;
- struct file *fvmdk; /* backing image file opened in vmdk_init() */
- struct task_struct *vmdk_thread; /* kernel thread running vmdk_rw_thread() */
- wait_queue_head_t vmdk_event; /* woken by vmdk_make_request() after a bio is queued */
- struct bio_list vmdk_bio_list; /* bios waiting for the thread; added and popped under vmdk_lock */
- uint64_t size; /* device size in bytes; vmdk_transfer() rejects requests beyond it */
- spinlock_t vmdk_lock;
- };
- struct block_device_operations vmdk_fops = { /* only .owner is set; no open/release/ioctl handlers are provided */
- .owner = THIS_MODULE,
- };
- static struct vmdk_dev *vmdk_device;
- static int vmdk_major;// the vmdk file is exposed as a block device, so a major number is needed
- static int hardsect_size = 512;
- static struct vmdisk_header header4; /* header of a "KDMV" image, filled by open_disk() */
- static struct cowdisk_header header; /* header of a "COWD" image. NOTE(review): struct cowdisk_header is not defined in the vmdk.h shown in this post - confirm where it comes from */
- static uint64_t disk_limit;// disk capacity (open_disk() stores it in bytes)
- static unsigned int granule_size; /* grain size in bytes, set by open_disk() */
- static uint32_t *l1dir;// storage for the grain directory entries (GDEs)
- static unsigned int cached_l2dir;// GDE index used by the previous access; when the current access falls in the range covered by the same GDE, the grain table is not read again and the one already in l2dir[] is reused
- static uint32_t l2dir[L2_SIZE];// contents of one grain table (GTEs) - 512 entries
- static struct vmdk_prm { /* lookup geometry derived from the image header by open_disk() */
- uint32_t grain_table_size; // corresponds to num_gtes_per_gte
- uint32_t sectors_per_grain;// number of sectors in one grain; the manual's default is 128, i.e. a 64kB grain
- uint32_t sectors_per_table; // product of the two fields above: the number of sectors covered by one grain table
- uint32_t directory_size;// number of grain directory entries (GDEs)
- } vdsk;
- /* ---------- FIFO helpers; familiar to anyone who has written a block driver ---------- */
- /* Append `bio` to the tail of `bl`. The bio's bi_next link is reset first. */
- static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
- {
-     struct bio *last = bl->tail;
-
-     bio->bi_next = NULL;
-     if (last == NULL)
-         bl->head = bio;         /* list was empty: the new bio is also the head */
-     else
-         last->bi_next = bio;
-     bl->tail = bio;
- }
- /* Return non-zero when `bl` holds no bio. */
- static inline int bio_list_empty(const struct bio_list *bl)
- {
-     return !bl->head;
- }
- /* Detach and return the oldest bio of `bl`, or NULL when the list is empty. */
- static inline struct bio *bio_list_pop(struct bio_list *bl)
- {
-     struct bio *first = bl->head;
-
-     if (first == NULL)
-         return NULL;
-
-     bl->head = first->bi_next;
-     if (bl->head == NULL)
-         bl->tail = NULL;        /* that was the last element */
-     first->bi_next = NULL;
-     return first;
- }
- /* Queue `bio` on the device's pending list (vmdk_make_request() calls this with vmdk_lock held). */
- static void vmdk_add_bio(struct vmdk_dev *dev, struct bio *bio)
- {
-     struct bio_list *pending = &dev->vmdk_bio_list;
-
-     bio_list_add(pending, bio);
- }
- /* Take the oldest pending bio off the device's list; NULL when nothing is queued. */
- static struct bio * vmdk_get_bio(struct vmdk_dev *dev)
- {
-     struct bio_list *pending = &dev->vmdk_bio_list;
-
-     return bio_list_pop(pending);
- }
- //--------------------------------------------------------
- /*
-  * Read `length` bytes of file `f`, starting at absolute byte `offset`, into
-  * `buffer`.
-  *
-  * Returns the number of bytes read (possibly fewer than requested) or -1 on
-  * failure. `buffer` is a kernel address, so callers switch to KERNEL_DS
-  * first, as open_disk() and vmdk_transfer() do.
-  */
- static int read_physical(struct file *f, loff_t offset, size_t length, void *buffer)
- {
-     loff_t pos = offset;    /* private cursor: no seek needed, f->f_pos is left alone */
-     ssize_t n;              /* signed so that a negative errno from ->read is visible */
-
-     if (!f->f_op || !f->f_op->read)
-         return -1;
-
-     n = f->f_op->read(f, buffer, length, &pos);
-     if (n < 0) {
-         printk(KERN_ALERT "read from disk %lld\n", (long long)offset);
-         return -1;
-     }
-     return n;
- }
- /*
-  * Write `length` bytes from `buffer` to file `f` at absolute byte `offset`.
-  *
-  * Returns the number of bytes written (possibly fewer than requested) or -1
-  * on failure. `buffer` is a kernel address, so callers switch to KERNEL_DS
-  * first, as vmdk_transfer() does.
-  */
- static int write_physical(struct file *f, loff_t offset, size_t length, void *buffer)
- {
-     loff_t pos = offset;    /* private cursor: no seek needed, f->f_pos is left alone */
-     ssize_t n;              /* signed so that a negative errno from ->write is visible */
-
-     if (!f->f_op || !f->f_op->write)
-         return -1;
-
-     n = f->f_op->write(f, buffer, length, &pos);
-     if (n < 0) {
-         printk(KERN_ALERT "write to disk %lld\n", (long long)offset);
-         return -1;
-     }
-     return n;
- }
- /*
-  * Load one grain table (`num` 32-bit entries stored at sector `offset` of
-  * the file) into the global l2dir[] buffer.
-  *
-  * Returns 0 on success and non-zero on failure.
-  */
- static int read_l2dir(struct file *f, size_t offset, int num)
- {
-     size_t bytes;
-
-     /* l2dir[] has room for L2_SIZE entries only. */
-     if (num <= 0 || num > L2_SIZE)
-         return 1;
-
-     /* Compare against what was requested, not against sizeof(l2dir). */
-     bytes = sizeof(l2dir[0]) * num;
-     /* Widen before shifting so sector numbers beyond 4 GiB do not wrap. */
-     return read_physical(f, (loff_t)offset << SECTOR_BITS, bytes, (char *)l2dir) != bytes;
- }
- /*
-  * Allocate the global l1dir array and fill it with the `num` 32-bit grain
-  * directory entries stored at sector `offset` of the file.
-  *
-  * Returns 0 on success and non-zero on failure; on failure l1dir is left
-  * NULL so no half-filled directory can be used or freed twice.
-  */
- static int read_l1dir(struct file *f, size_t offset, int num)
- {
-     size_t bytes;
-
-     if (num <= 0)
-         return -1;
-
-     bytes = sizeof(*l1dir) * num;
-     l1dir = kmalloc(bytes, GFP_KERNEL);
-     if (!l1dir)
-         return -1;
-
-     /* Widen before shifting so sector numbers beyond 4 GiB do not wrap. */
-     if (read_physical(f, (loff_t)offset << SECTOR_BITS, bytes, (char *)l1dir) != bytes) {
-         kfree(l1dir);
-         l1dir = NULL;
-         return 1;
-     }
-     return 0;
- }
- /*
-  * Initialise the lookup state from an opened image file: recognise the
-  * format from its 4-byte magic, read the matching header, derive the
-  * geometry (vdsk, granule_size, disk_limit) and load the grain directory
-  * into l1dir.
-  *
-  * Header values come from the file and are used as divisors and as the
-  * size of the l2dir[] read, so they are range-checked before use.
-  *
-  * Returns 0 on success and -1 on failure.
-  */
- static int open_disk(struct file *f)
- {
-     char magic[4];
-     int ret = -1;
-     uint64_t m;
-     mm_segment_t old_fs;
-
-     /* Kernel buffers are passed to the file's ->read method below. */
-     old_fs = get_fs();
-     set_fs(KERNEL_DS);
-
-     /* No grain table is cached yet, whichever format follows. */
-     cached_l2dir = -1;
-
-     if (f->f_op->read(f, magic, sizeof(magic), &f->f_pos) != sizeof(magic)) {
-         printk(KERN_ALERT "error magic\n");
-         goto out_set_fs;
-     }
-
-     if (!memcmp(magic, "KDMV", sizeof(magic))) {
-         /* Only this format is really implemented (author's note). */
-         if (f->f_op->read(f, (void *)&header4, sizeof(header4), &f->f_pos) != sizeof(header4)) {
-             printk(KERN_ALERT "error header\n");
-             goto out_set_fs;
-         }
-         /*
-          * granularity must be non-zero and small enough that both
-          * granule_size and sectors_per_table fit in 32 bits; the grain
-          * table must fit in l2dir[].
-          */
-         if (header4.granularity == 0 ||
-             header4.granularity > (0xFFFFFFFFu >> SECTOR_BITS) ||
-             header4.num_gtes_per_gte == 0 ||
-             header4.num_gtes_per_gte > L2_SIZE) {
-             printk(KERN_ALERT "bad vmdk geometry\n");
-             goto out_set_fs;
-         }
-         granule_size = header4.granularity << SECTOR_BITS;
-         disk_limit = header4.capacity << SECTOR_BITS;
-         vdsk.grain_table_size = header4.num_gtes_per_gte;
-         vdsk.sectors_per_grain = header4.granularity;
-         vdsk.sectors_per_table = vdsk.grain_table_size * vdsk.sectors_per_grain;
-
-         /* Number of grain tables needed to cover the capacity, rounded up... */
-         m = header4.capacity + vdsk.sectors_per_table - 1;
-         do_div(m, vdsk.sectors_per_table);
-         /* ...plus one, kept from the original code. NOTE(review): confirm the extra entry is intended. */
-         vdsk.directory_size = m + 1;
-
-         /* NOTE(review): the directory is taken from rgd_offset (redundant copy), not gd_offset. */
-         if (read_l1dir(f, header4.rgd_offset, vdsk.directory_size))
-             goto out_set_fs;
-     } else if (!memcmp(magic, "COWD", sizeof(magic))) {
-         if (f->f_op->read(f, (void *)&header, sizeof(header), &f->f_pos) != sizeof(header)) {
-             printk(KERN_ALERT "error header\n");
-             goto out_set_fs;
-         }
-         if (header.granularity == 0) {
-             printk(KERN_ALERT "bad vmdk geometry\n");
-             goto out_set_fs;
-         }
-         granule_size = header.granularity << SECTOR_BITS;
-         vdsk.sectors_per_grain = header.granularity;
-         vdsk.grain_table_size = L2_SIZE;
-         vdsk.sectors_per_table = vdsk.grain_table_size * vdsk.sectors_per_grain;
-         vdsk.directory_size = L1_SIZE;
-         disk_limit = header.disk_sectors << SECTOR_BITS;
-         if (read_l1dir(f, header.l1dir_offset, L1_SIZE))
-             goto out_set_fs;
-     } else {
-         printk(KERN_ALERT "Not vmdk file\n");
-         goto out_set_fs;
-     }
-
-     ret = 0;
- out_set_fs:
-     set_fs(old_fs);
-     return ret;
- }
- /*
-  * Read `length` bytes of the virtual disk, starting at byte `offset`, into
-  * `buffer`.
-  *
-  * The request is served grain by grain: each piece is translated through
-  * the grain directory (l1dir) and the cached grain table (l2dir). Regions
-  * whose directory or table entry is 0 are returned as zeros.
-  *
-  * Returns 1 on success and (size_t)-1 on failure.
-  */
- static size_t copy_virtual(struct vmdk_prm *dsk, struct file *f, loff_t offset, void *buffer, unsigned long length)
- {
-     unsigned long remaining = length;
-
-     while (remaining > 0) {
-         unsigned int granule_offset;    /* byte offset inside the grain */
-         unsigned int grain_index;       /* entry inside the grain table */
-         unsigned int sector_map_idx;    /* entry inside the grain directory */
-         uint32_t sector_in_table;
-         unsigned long chunk;
-         uint64_t m;
-
-         /* Nothing can be read at or past the end; also keeps chunk from becoming 0. */
-         if ((uint64_t)offset >= disk_limit)
-             return -1;
-
-         m = offset;
-         granule_offset = do_div(m, granule_size);
-
-         /*
-          * Size of this piece, recomputed on every pass: never more than
-          * what is left of the request, of the current grain, or of the disk.
-          */
-         chunk = remaining;
-         if (chunk > granule_size - granule_offset)
-             chunk = granule_size - granule_offset;
-         if (chunk > disk_limit - offset)
-             chunk = disk_limit - offset;
-
-         /* One division yields both the directory index and the sector inside the table. */
-         m = offset >> SECTOR_BITS;
-         sector_in_table = do_div(m, dsk->sectors_per_table);
-         if (m >= dsk->directory_size) {
-             printk(KERN_ALERT "cannot locate grain table for %d in %d\n", (unsigned int)m, dsk->directory_size);
-             return -1;
-         }
-         sector_map_idx = m;
-
-         if (l1dir[sector_map_idx] == 0) {
-             printk(KERN_ALERT "l1zero\n");
-             memset(buffer, 0, chunk);
-         } else {
-             if (sector_map_idx != cached_l2dir) {
-                 /* l2dir is about to be overwritten: drop the tag until the read succeeds. */
-                 cached_l2dir = -1;
-                 if (read_l2dir(f, l1dir[sector_map_idx], dsk->grain_table_size)) {
-                     printk(KERN_ALERT "read failed\n");
-                     return -1;
-                 }
-                 cached_l2dir = sector_map_idx;
-             }
-
-             grain_index = sector_in_table / dsk->sectors_per_grain;
-             if (grain_index >= dsk->grain_table_size) {
-                 printk(KERN_ALERT "grain to large\n");
-                 return -1;
-             }
-
-             if (l2dir[grain_index] == 0) {
-                 memset(buffer, 0, chunk);
-             } else if (read_physical(f, ((loff_t)l2dir[grain_index] << SECTOR_BITS) + granule_offset, chunk, buffer) != chunk) {
-                 /* The sector number is widened before the shift so files over 4 GiB work. */
-                 printk(KERN_ALERT "read error 2\n");
-                 return -1;
-             }
-         }
-
-         offset += chunk;
-         buffer += chunk;
-         remaining -= chunk;
-     }
-     return 1;
- }
- /*
-  * Write `length` bytes from `buffer` to the virtual disk at byte `offset`.
-  *
-  * The request is served grain by grain. When the grain table entry is 0 a
-  * new grain is appended to the file: the file is first extended by one
-  * zero-filled grain, then the table entry is written to disk, and only then
-  * is the cached table updated. Missing grain tables (directory entry 0) are
-  * not created; such a write fails.
-  *
-  * NOTE(review): only the grain table reached through l1dir is updated;
-  * open_disk() loads l1dir from rgd_offset, so confirm whether the other
-  * copy of the table must be kept in sync.
-  *
-  * Returns 1 on success and (size_t)-1 on failure.
-  */
- static size_t write_virtual(struct vmdk_prm *dsk, struct file *f, loff_t offset, void *buffer, unsigned long length)
- {
-     unsigned long remaining = length;
-     char zero = 0;      /* a new grain must read back as zeros, including its last byte */
-
-     while (remaining > 0) {
-         unsigned int granule_offset;    /* byte offset inside the grain */
-         unsigned int grain_index;       /* entry inside the grain table */
-         unsigned int sector_map_idx;    /* entry inside the grain directory */
-         uint32_t sector_in_table;
-         unsigned long chunk;
-         uint64_t m;
-
-         if ((uint64_t)offset >= disk_limit)
-             return -1;
-
-         m = offset;
-         granule_offset = do_div(m, granule_size);
-
-         /* Piece size, recomputed on every pass so it never exceeds what is left. */
-         chunk = remaining;
-         if (chunk > granule_size - granule_offset)
-             chunk = granule_size - granule_offset;
-         if (chunk > disk_limit - offset)
-             chunk = disk_limit - offset;
-
-         m = offset >> SECTOR_BITS;
-         sector_in_table = do_div(m, dsk->sectors_per_table);
-         if (m >= dsk->directory_size) {
-             printk(KERN_ALERT "cannot locate grain table for %d in %d\n", (unsigned int)m, dsk->directory_size);
-             return -1;
-         }
-         sector_map_idx = m;
-
-         if (l1dir[sector_map_idx] == 0)
-             return -1;
-
-         if (sector_map_idx != cached_l2dir) {
-             /* l2dir is about to be overwritten: drop the tag until the read succeeds. */
-             cached_l2dir = -1;
-             if (read_l2dir(f, l1dir[sector_map_idx], dsk->grain_table_size)) {
-                 printk(KERN_ALERT "read failed\n");
-                 return -1;
-             }
-             cached_l2dir = sector_map_idx;
-         }
-
-         grain_index = sector_in_table / dsk->sectors_per_grain;
-         if (grain_index >= dsk->grain_table_size) {
-             printk(KERN_ALERT "grain to large\n");
-             return -1;
-         }
-
-         if (l2dir[grain_index] == 0) {
-             loff_t end;
-             uint32_t new_grain;
-
-             printk(KERN_ALERT "gaga\n");
-             end = f->f_op->llseek(f, 0, SEEK_END);
-             if (end < 0)
-                 return -1;
-             /* Grains are addressed in sectors: start the new one on a sector boundary. */
-             end = (end + SECTOR_MASK) & ~(loff_t)SECTOR_MASK;
-             if (((uint64_t)end >> SECTOR_BITS) > 0xFFFFFFFFULL)
-                 return -1;      /* would not fit in a 32-bit table entry */
-             new_grain = end >> SECTOR_BITS;
-
-             /* Extend the file by one whole grain (real grain size, not a fixed 64 KiB). */
-             if (write_physical(f, end + granule_size - 1, 1, &zero) != 1)
-                 return -1;
-             /* Record the grain in the on-disk table... */
-             if (write_physical(f, ((loff_t)l1dir[sector_map_idx] << SECTOR_BITS) + ((loff_t)grain_index << 2), 4, &new_grain) != 4)
-                 return -1;
-             /* ...and only then in the cached copy. */
-             l2dir[grain_index] = new_grain;
-         }
-
-         if (write_physical(f, ((loff_t)l2dir[grain_index] << SECTOR_BITS) + granule_offset, chunk, buffer) != chunk) {
-             printk(KERN_ALERT "write error\n");
-             return -1;
-         }
-
-         buffer += chunk;
-         offset += chunk;
-         remaining -= chunk;
-     }
-     return 1;
- }
- /*
-  * Move `nsect` sectors between `buffer` and the virtual disk, starting at
-  * `sector`. `write` selects the direction (non-zero: buffer -> disk).
-  *
-  * Returns 0 on success and a non-zero value (-EIO) on failure, including a
-  * request that runs past the end of the device.
-  */
- static int vmdk_transfer(struct vmdk_dev *dev, unsigned long sector,
-                          unsigned long nsect, char *buffer, int write)
- {
-     /* Widen before multiplying: unsigned long may be 32 bits wide. */
-     loff_t offset = (loff_t)sector * KERNEL_SECTOR_SIZE;
-     loff_t nbytes = (loff_t)nsect * KERNEL_SECTOR_SIZE;
-     struct file *f = dev->fvmdk;
-     mm_segment_t old_fs;
-     size_t done;
-
-     if ((offset + nbytes) > dev->size) {
-         printk (KERN_NOTICE "Beyond-end write %llx\n", (unsigned long long)offset);
-         printk (KERN_NOTICE "Beyond-end write %llx\n", (unsigned long long)nbytes);
-         return -EIO;
-     }
-
-     /* The helpers hand kernel buffers to the file's read/write methods. */
-     old_fs = get_fs();
-     set_fs(KERNEL_DS);
-
-     if (write)
-         done = write_virtual(&vdsk, f, offset, buffer, nbytes);
-     else
-         done = copy_virtual(&vdsk, f, offset, buffer, nbytes);
-
-     set_fs(old_fs);
-
-     /* Both helpers return size_t and signal failure with (size_t)-1. */
-     return (done == (size_t)-1) ? -EIO : 0;
- }
- /*
-  * Serve one bio: map each segment, transfer it through vmdk_transfer() and
-  * complete the bio with 0 on success or -EIO after the first failure.
-  */
- static void vmdk_handle_bio(struct vmdk_dev *dev, struct bio *bio)
- {
-     int i;
-     struct bio_vec *bvec;
-     sector_t sector = bio->bi_sector;
-     int status = 0;
-
-     /* Do each segment independently. */
-     bio_for_each_segment(bvec, bio, i) {
-         /* Use this segment's own length; bio_cur_sectors() describes the segment at bi_idx only. */
-         unsigned long nsect = bvec->bv_len / KERNEL_SECTOR_SIZE;
-         char *buffer = (char *)kmap(bvec->bv_page) + bvec->bv_offset;
-
-         status = vmdk_transfer(dev, sector, nsect,
-                                buffer, bio_data_dir(bio) == WRITE);
-         /* Unmap before leaving the loop so a failure does not leak the mapping. */
-         kunmap(bvec->bv_page);
-         if (status)
-             break;
-         sector += nsect;
-     }
-     bio_endio(bio, status ? -EIO : 0);
- }
- /*
-  * Body of the per-device kernel thread. It sleeps until a bio is queued or
-  * the thread is asked to stop, and serves queued bios one at a time. The
-  * loop ends once a stop was requested and the list is empty, or when the
-  * interruptible wait returns non-zero.
-  */
- static int vmdk_rw_thread(void *data)
- {
-     struct vmdk_dev *vd = data;
-     struct bio *next;
-
-     set_user_nice(current, -20);
-
-     for (;;) {
-         if (kthread_should_stop() && bio_list_empty(&vd->vmdk_bio_list))
-             break;
-
-         if (wait_event_interruptible(vd->vmdk_event,
-                                      !bio_list_empty(&vd->vmdk_bio_list) ||
-                                      kthread_should_stop()))
-             break;
-
-         if (!bio_list_empty(&vd->vmdk_bio_list)) {
-             /* Only the list manipulation is done under the lock. */
-             spin_lock_irq(&vd->vmdk_lock);
-             next = vmdk_get_bio(vd);
-             spin_unlock_irq(&vd->vmdk_lock);
-
-             BUG_ON(!next);
-             vmdk_handle_bio(vd, next);
-         }
-     }
-     return 0;
- }
- /*
-  * make_request hook of the queue: put the incoming bio on the device's
-  * pending list and wake the worker thread. Always returns 0.
-  */
- static int vmdk_make_request(struct request_queue *q, struct bio *old_bio)
- {
-     struct vmdk_dev *vd = q->queuedata;
-     int dir = bio_rw(old_bio);
-
-     /* Read-ahead is treated as an ordinary read. */
-     if (dir == READA)
-         dir = READ;
-     BUG_ON(!(vd && (dir == READ || dir == WRITE)));
-
-     spin_lock_irq(&vd->vmdk_lock);
-     vmdk_add_bio(vd, old_bio);
-     wake_up(&vd->vmdk_event);   /* wake the kernel thread so it handles the bio */
-     spin_unlock_irq(&vd->vmdk_lock);
-
-     return 0;
- }
- /*
-  * Allocate and wire up device number `i`: the device structure, its
-  * request queue, a gendisk with 16 minors, and the worker thread (started
-  * before returning). The disk is not registered here; vmdk_init() calls
-  * add_disk().
-  *
-  * Returns the new device, or NULL on failure with everything released.
-  */
- struct vmdk_dev *vmdk_alloc(int i)
- {
-     struct vmdk_dev *dev;
-     struct gendisk *disk;
-
-     dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-     if (!dev)
-         goto out;
-
-     dev->vmdk_queue = blk_alloc_queue(GFP_KERNEL);
-     if (!dev->vmdk_queue)
-         goto out_free_dev;
-     /* vmdk_make_request() is called whenever a bio is submitted to the queue. */
-     blk_queue_make_request(dev->vmdk_queue, &vmdk_make_request);
-     blk_queue_hardsect_size(dev->vmdk_queue, hardsect_size);
-     dev->vmdk_queue->queuedata = dev;
-
-     disk = dev->vmdk_disk = alloc_disk(16);
-     if (!disk)
-         goto out_free_queue;
-
-     init_waitqueue_head(&dev->vmdk_event);
-     spin_lock_init(&dev->vmdk_lock);
-     disk->major = vmdk_major;
-     disk->first_minor = i * 16;
-     disk->fops = &vmdk_fops;
-     disk->private_data = dev;
-     disk->queue = dev->vmdk_queue;
-     /* Name follows the device index: vmdka for 0, vmdkb for 1, ... */
-     snprintf(disk->disk_name, sizeof(disk->disk_name), "vmdk%c", 'a' + i);
-
-     dev->vmdk_thread = kthread_create(vmdk_rw_thread, dev, "vmdk%d", i);
-     /* Check for failure before touching the task: an ERR_PTR must not be woken. */
-     if (IS_ERR(dev->vmdk_thread))
-         goto out_put_disk;
-     wake_up_process(dev->vmdk_thread);
-     return dev;
-
- out_put_disk:
-     put_disk(disk);
- out_free_queue:
-     blk_cleanup_queue(dev->vmdk_queue);
- out_free_dev:
-     kfree(dev);
- out:
-     return NULL;
- }
- /*
-  * Module entry point: register the block major, create device 0, open the
-  * image named by the `name` module parameter, parse it and publish the disk.
-  *
-  * Returns 0 on success or a negative errno; on failure every resource
-  * acquired so far is released, including the worker thread started by
-  * vmdk_alloc().
-  */
- static int __init vmdk_init(void)
- {
-     int err;
-     struct file *f;
-
-     vmdk_major = register_blkdev(0, "vmdk");
-     if (vmdk_major <= 0) {
-         printk(KERN_ALERT "can not get major\n");
-         return -EBUSY;
-     }
-
-     vmdk_device = vmdk_alloc(0);
-     if (vmdk_device == NULL) {
-         printk(KERN_ALERT "no mem\n");
-         err = -ENOMEM;
-         goto out_unregister;
-     }
-
-     if (!name) {
-         err = -EINVAL;          /* no image path was given */
-         goto out_free_dev;
-     }
-
-     f = filp_open(name, O_RDWR | O_LARGEFILE, 0664);
-     if (IS_ERR(f)) {
-         err = PTR_ERR(f);
-         goto out_free_dev;
-     }
-     vmdk_device->fvmdk = f;
-
-     if (open_disk(f) < 0) {
-         err = -EINVAL;
-         goto out_close_file;
-     }
-
-     /* disk_limit is the byte capacity open_disk() computed for whichever format was found. */
-     set_capacity(vmdk_device->vmdk_disk, disk_limit >> SECTOR_BITS);
-     vmdk_device->size = disk_limit;
-     add_disk(vmdk_device->vmdk_disk);
-
-     return 0;
-
- out_close_file:
-     kfree(l1dir);
-     l1dir = NULL;
-     filp_close(f, current->files);
- out_free_dev:
-     /* The thread uses vmdk_device, so it must be stopped before the device is freed. */
-     kthread_stop(vmdk_device->vmdk_thread);
-     put_disk(vmdk_device->vmdk_disk);
-     blk_cleanup_queue(vmdk_device->vmdk_queue);
-     kfree(vmdk_device);
-     vmdk_device = NULL;
- out_unregister:
-     unregister_blkdev(vmdk_major, "vmdk");
-     return err;
- }
- /*
-  * Module exit: tear the device down in the reverse order of its setup.
-  * The disk is unregistered first so no new bios arrive, then the worker
-  * thread is stopped (its loop drains bios still queued), and only then are
-  * the queue, the gendisk, the image file and the memory released.
-  */
- static void vmdk_exit(void)
- {
-     del_gendisk(vmdk_device->vmdk_disk);
-     kthread_stop(vmdk_device->vmdk_thread);
-     blk_cleanup_queue(vmdk_device->vmdk_queue);
-     put_disk(vmdk_device->vmdk_disk);       /* drop the reference taken by alloc_disk() */
-     filp_close(vmdk_device->fvmdk, current->files);
-     kfree(l1dir);
-     kfree(vmdk_device);
-     unregister_blkdev(vmdk_major, "vmdk");
- }
- module_init(vmdk_init);
- module_exit(vmdk_exit);
- module_param(name, charp, S_IRUGO);
- MODULE_LICENSE("GPL");
第一次写这个代码的时候没做什么注释,只是写了个简单的文档,其实是10年1月时写的代码,刚刚来道上混,不懂江湖规矩,大家将就一下吧。现在把代码拿出来与大家分享,想做点注释,但是又觉得麻烦。因为代码的调用关系很简单。并且也没有内核中那么复杂的merge算法,纯bio操作。如果你了解基本的块设备驱动的编写方法,并且有这方面的工作需求(都是b出来的),读这部分的代码应该很容易的。
写这个代码用了我大约40天的时间,呵呵。很痛苦的经历。虽然代码写的很简单,也不是多完善,功能上确实可以正常运行了。
个人觉得在编写代码中遇到的难点是:
1.如何在内核态读取文件,cu上很多类似的代码都是insmod时读取,但是如果你需要insmod之后读取呢,这时就需要自己建立一个进程上下文的环境,开启内核线程。
2.对vmdk文件格式的把握,然后在代码中严格按照格式去操作,代码中的copy_virtual和write_virtual是历经磨难
3.对bio结构体的操作,需要kmap和kunmap
4.开启内核线程之后,需要互斥对bio链表的操作,内核发来bio,这时唤醒内核线程对bio操作,所以需要互斥
在加载驱动后,会在/dev下看到vmdka vmdka1之类的设备节点,如果你的vmdk文件是包括文件系统的,这时你可以mount这个设备节点,然后到相应的目录下进行操作就可以了
参考:
ldd3
ldk2
cu上赵磊大侠的文章
qemu源码 vmdk2raw.c
vmdk-loop.c
linux loop驱动
无数次的google
一两次的cu发帖
vm-spec文档 |
评分
-
查看全部评分
|