免费注册 查看新帖 |

Chinaunix

  平台 论坛 博客 文库
12下一页
最近访问板块 发新帖
查看: 10648 | 回复: 12
打印 上一主题 下一主题

把vmdk文件模拟为块设备 [复制链接]

论坛徽章:
0
跳转到指定楼层
1 [收藏(0)] [报告]
发表于 2011-07-02 21:26 |只看该作者 |倒序浏览
本帖最后由 木叉叉木大 于 2011-07-04 14:33 编辑

不是高手,代码很搓,第一次发出自己的内容来,希望大家指教 代码部分的排版比较乱,直接从博客上copy过来的,不会用wordpress啊

目标:基于linux平台,在内核态解析vmdk文件,把文件模拟为块设备,最后可以mount (主要是做一个简单的sparse文件读写,所以很多vmdk格式的内容没有考虑,后面会随着文档列出)

环境:Ubuntu8.04  linux-2.6.24-24  gcc-4.2.4

vmdk文件格式分析:

注:这里只描述Hosted Sparse Extents(所谓的稀疏文件,一开始创建时很小,随着数据的增多而变大)

在我们使用vmware虚拟机创建一个虚拟磁盘时,会提示你是一个single file 还是 多个split file 并且我们是sparse文件 不是flat文件。下面是一个sparse文件的格式



vmdk spec 1.1 给出的

Hosted Sparse Extent Header (对应上图的Sparse header)
The following example shows the content of a sparse extent’s header from a VMware hosted
product, such as VMware Workstation, VMware Player, VMware ACE, VMware Server, or VMware
GSX Server:

This structure (struct SparseExtentHeader) needs to be packed. If you use gcc to compile your application, you must use the
keyword __attribute__((__packed__)).
typedef uint64 SectorType;
typedef uint8 Bool;
struct vmdisk_header {
uint32_t version;
uint32_t flags;
int64_t capacity;//代表这个extent的大小,以扇区数表示
int64_t granularity;//一个grain的大小,以扇区数表示
int64_t desc_offset;//描述符的偏移,以扇区为基数
int64_t desc_size; //描述符的大小,以扇区为基数
int32_t num_gtes_per_gte;//graintable的entry数目,规范上是512 就是每个GDE对应的GTE的数目
int64_t rgd_offset;//这里指向的是redundant grain directory的位置
int64_t gd_offset;// 这里指向的是grain directory的位置
int64_t grain_offset;
char filler[1];
char check_bytes[4];
};
通过下面这个图可以找到需要的数据,其中数据都是以扇区为偏移的,在规范的11页有详细的计算方法

代码主要是根据读取文件来填充header结构体,然后根据header结构体,和上面这张图,来读取相应的位置的数据,如果是写数据,就计算好位置,写进去。
-----------------------------------------------------------------------------------------
代码部分:

vmdk.h //头文件
  1. #define SECTOR_BITS 9
  2. #define SECTOR_SIZE (1 << SECTOR_BITS)
  3. #define SECTOR_MASK (SECTOR_SIZE - 1)
  4. #define L1_BITS (SECTOR_BITS - 3)
  5. #define L1_SIZE (1 << L1_BITS)
  6. #define L1_MASK (L1_SIZE - 1)
  7. #define L2_BITS SECTOR_BITS
  8. #define L2_SIZE (1 << L2_BITS)
  9. #define L2_MASK (L2_SIZE - 1)
  10. #define MIN(x,y) (((x) < (y)) ? (x) : (y))
  11. struct vmdisk_header
  12. {
  13. uint32_t version;
  14. uint32_t flags;
  15. uint64_t capacity;
  16. uint64_t granularity;
  17. uint64_t desc_offset;
  18. uint64_t desc_size;
  19. uint32_t num_gtes_per_gte;
  20. uint64_t rgd_offset;
  21. uint64_t gd_offset;
  22. uint64_t grain_offset;
  23. char filler[1];
  24. char check_bytes[4];
  25. };
复制代码
vmdk.c //主文件
  1. #include <linux/module.h>
  2. #include <linux/init.h>
  3. #include <linux/moduleparam.h>
  4. #include <linux/blkdev.h>
  5. #include <linux/fs.h>
  6. #include <linux/bio.h>
  7. #include <linux/kthread.h>
  8. #include <linux/spinlock.h>
  9. #include <linux/types.h>
  10. #include <asm/div64.h>
  11. #include <asm/stat.h>
  12. #include "vmdk.h"
  13. #define KERNEL_SECTOR_SIZE 512
  14. #define _LARGEFILE_SOURCE
  15. #define _LARGEFILE64_SOURCE
  16. #define _FILE_OFFSET_BITS 64
  17. static char *name = NULL;
  18. //static char name_vmdk[30] = "/home/pengpeng/a.vmdk";
  19. struct bio_list {
  20. struct bio *head;
  21. struct bio *tail;
  22. };//直接引用内核中未导出的结构体了
  23. struct vmdk_dev{
  24. struct request_queue *vmdk_queue;
  25. struct gendisk *vmdk_disk;
  26. struct file *fvmdk;
  27. struct task_struct *vmdk_thread;
  28. wait_queue_head_t vmdk_event;
  29. struct bio_list vmdk_bio_list;
  30. uint64_t size;
  31. spinlock_t vmdk_lock;
  32. };
  33. struct block_device_operations vmdk_fops = {
  34. .owner                = THIS_MODULE,
  35. };
  36. static struct vmdk_dev *vmdk_device;
  37. static int vmdk_major;//因为要把vmdk文件模拟为一个块设备,所以需要设备号
  38. static int hardsect_size = 512;
  39. static struct vmdisk_header header4;
  40. static struct cowdisk_header header;
  41. static uint64_t disk_limit;//磁盘容量
  42. static unsigned int granule_size;
  43. static uint32_t *l1dir;//GDE存储位置
  44. static unsigned int cached_l2dir;//代表上一次访问的GDE索引,如果当前访问的位置和上一次位于同一个GDE代表的范围内,就不要读取GTE表格了,直接用上一次读取的GTE表格
  45. static uint32_t l2dir[L2_SIZE];//GTE表格内容 -512项
  46. static struct vmdk_prm {
  47. uint32_t grain_table_size; //对应num_gtes_per_gte
  48. uint32_t sectors_per_grain;//一个grain所包含的扇区数 手册上默认是128 即一个grain是64kB
  49. uint32_t sectors_per_table; //前两项的乘积,就是一个GTE table所代表的扇区数
  50. uint32_t directory_size;//GDE的项数
  51. } vdsk;
  52. //----------了解块设备驱动的应该很熟悉这个吧
  53. static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
  54. {
  55. bio->bi_next = NULL;
  56. if (bl->tail)
  57. bl->tail->bi_next = bio;
  58. else
  59. bl->head = bio;
  60. bl->tail = bio;
  61. }
  62. static inline int bio_list_empty(const struct bio_list *bl)
  63. {
  64. return bl->head == NULL;
  65. }
  66. static inline struct bio *bio_list_pop(struct bio_list *bl)
  67. {
  68. struct bio *bio = bl->head;
  69. if (bio) {
  70. bl->head = bl->head->bi_next;
  71. if (!bl->head)
  72. bl->tail = NULL;
  73. bio->bi_next = NULL;
  74. }
  75. return bio;
  76. }
  77. static void vmdk_add_bio(struct vmdk_dev *dev, struct bio *bio)
  78. {
  79. bio_list_add(&dev->vmdk_bio_list, bio);
  80. }
  81. static struct bio * vmdk_get_bio(struct vmdk_dev *dev)
  82. {
  83. return bio_list_pop(&dev->vmdk_bio_list);
  84. }
  85. //--------------------------------------------------------
  86. //------------从文件的offset偏移处读取length长度的文件,存放到buffer处
  87. static int read_physical(struct file *f, loff_t offset, size_t length, void *buffer)
  88. {
  89. size_t n;
  90. if(f->f_op->llseek(f, offset, SEEK_SET) == (loff_t)-1)
  91. return -1;
  92. n = f->f_op->read(f, buffer, length, &f->f_pos);
  93. if (n < 0) {
  94. printk(KERN_ALERT "read from disk %lld", offset);
  95. return -1;
  96. }

  97. return n;
  98. }
  99. //-----------写到文件的offset处
  100. static int write_physical(struct file *f, loff_t offset, size_t length, void *buffer)
  101. {
  102. size_t n;
  103. if(f->f_op->llseek(f, offset, SEEK_SET) == (loff_t)-1)
  104. return -1;
  105. n = f->f_op->write(f, buffer, length, &f->f_pos);
  106. if (n < 0) {
  107. printk(KERN_ALERT "write to disk %lld", offset);

  108. return -1;
  109. }
  110. //printk(KERN_ALERT "write_physical 0x%x\n", n);
  111. return n;
  112. }
  113. //读取某个GTE表格
  114. static int read_l2dir(struct file *f, size_t offset, int num)
  115. {
  116. return read_physical(f, offset << SECTOR_BITS, sizeof(l2dir[0]) * num, (char *)l2dir) != sizeof(l2dir);
  117. }
  118. //读取GDE
  119. static int read_l1dir(struct file *f, size_t offset, int num)
  120. {
  121. l1dir = kmalloc(sizeof(*l1dir) * num, GFP_KERNEL);
  122. if (!l1dir)
  123. return -1;
  124. return read_physical(f, offset << SECTOR_BITS, sizeof(*l1dir) * num, (char *)l1dir) != (sizeof(*l1dir) * num);
  125. }
  126. //主要是对需要的结构进行一些初始化设置
  127. static int open_disk(struct file *f)
  128. {
  129. char magic[4];
  130. int ret = 0;
  131. int d;
  132. uint64_t  m;
  133. mm_segment_t old_fs;
  134. old_fs = get_fs(); //-------内核中读写文件必须这两句吧
  135. set_fs(KERNEL_DS);//------

  136. if(f->f_op->read(f, magic, sizeof(magic), &f->f_pos) != sizeof(magic)){
  137. printk(KERN_ALERT "error magic\n");
  138. ret = -1;
  139. goto out_set_fs;
  140. }
  141. if (!memcmp(magic, "KDMV", sizeof(magic))) { //----代码其实只实现了vmdk格式
  142. d = 1;
  143. } else if (!memcmp(magic, "COWD", sizeof(magic))) {
  144. d = 2;
  145. } else {
  146. printk(KERN_ALERT "Not vmdk file\n");
  147. ret = -1;
  148. goto out_set_fs;
  149. }
  150. if(d == 1){
  151. if(f->f_op->read(f, (void*)&header4, sizeof(header4), &f->f_pos) != sizeof(header4)){
  152. printk(KERN_ALERT "error header\n");
  153. ret = -1;
  154. goto out_set_fs;
  155. }
  156. granule_size = header4.granularity << SECTOR_BITS;
  157. disk_limit = header4.capacity << SECTOR_BITS;
  158. cached_l2dir = -1;
  159. vdsk.grain_table_size = header4.num_gtes_per_gte;
  160. vdsk.sectors_per_grain = header4.granularity;
  161. vdsk.sectors_per_table = vdsk.grain_table_size * vdsk.sectors_per_grain;

  162. m = header4.capacity + vdsk.sectors_per_table - 1;
  163. do_div(m, vdsk.sectors_per_table);
  164. vdsk.directory_size = m + 1;
  165. //printk(KERN_ALERT "directory _size %d", vdsk.directory_size);
  166. if (read_l1dir(f, header4.rgd_offset, vdsk.directory_size)){
  167. ret = -1;
  168.    goto out_set_fs;
  169. }
  170. }
  171. else{
  172.    if(f->f_op->read(f, (void*)&header, sizeof(header), &f->f_pos) != sizeof(header)){
  173. printk(KERN_ALERT "error header\n");
  174. ret = -1;
  175. goto out_set_fs;
  176. }
  177. granule_size = header.granularity << SECTOR_BITS;
  178. vdsk.sectors_per_grain = header.granularity;
  179. vdsk.grain_table_size = L2_SIZE;
  180. vdsk.sectors_per_table = vdsk.grain_table_size * vdsk.sectors_per_grain;
  181. vdsk.directory_size = L1_SIZE;
  182. disk_limit = header.disk_sectors << SECTOR_BITS;
  183. if (read_l1dir(f, header.l1dir_offset, L1_SIZE))
  184. ret = -1;
  185. goto out_set_fs;
  186. }
  187. out_set_fs:
  188. set_fs(old_fs);
  189. return ret;
  190. }
  191. static size_t copy_virtual(struct vmdk_prm *dsk, struct file *f, loff_t offset, void *buffer, unsigned long length)
  192. {
  193. unsigned int granule_offset;
  194. unsigned int grain_index;
  195. unsigned int sector_map_idx;
  196. int32_t l;
  197. uint64_t m;
  198. uint32_t n;
  199. l = length;
  200. while(l > 0){
  201. m = offset;
  202. granule_offset = do_div(m, granule_size);
  203. //printk(KERN_ALERT "granule_offset %d\n",granule_offset);
  204. // printk(KERN_ALERT "length %x\n",length);
  205. //length = MIN(length, granule_size - granule_offset);
  206. if(length > granule_size - granule_offset)
  207. length = granule_size - granule_offset;
  208. length = MIN(length, disk_limit - offset);
  209. //printk(KERN_ALERT "length %x\n",length);
  210. m = offset >> SECTOR_BITS;
  211. do_div(m, dsk->sectors_per_table);
  212. sector_map_idx = m;
  213. // printk(KERN_ALERT "sector_map_idx %d\n",sector_map_idx);
  214. if (sector_map_idx >= dsk->directory_size) {
  215. printk(KERN_ALERT "cannot locate grain table for %d in %d\n", sector_map_idx, dsk->directory_size);
  216. return -1;
  217. }
  218. if (l1dir[sector_map_idx] == 0) {
  219. printk(KERN_ALERT "l1zero\n");
  220. goto zero_fill;
  221. }
  222. if (sector_map_idx != cached_l2dir) {
  223. if (read_l2dir(f, l1dir[sector_map_idx], dsk->grain_table_size)) {
  224. printk(KERN_ALERT "read failed\n");
  225. return -1;
  226. }
  227. cached_l2dir = sector_map_idx;
  228. }

  229. m = offset >> SECTOR_BITS;
  230. n = do_div(m, dsk->sectors_per_table);
  231. grain_index = n / dsk->sectors_per_grain;
  232. if (grain_index >= dsk->grain_table_size) {
  233. printk(KERN_ALERT "grain to large\n");
  234. return -1;
  235. }
  236. if (l2dir[grain_index] == 0)
  237. {
  238. //printk(KERN_ALERT "l2zero\n");
  239. goto zero_fill;}
  240. if (read_physical(f, (l2dir[grain_index] << SECTOR_BITS) + granule_offset, length, buffer) != length) {
  241. printk(KERN_ALERT "read error 2\n");
  242. return -1;
  243. }
  244. goto zero_next;
  245. //return length;
  246. zero_fill:
  247. memset(buffer, 0 ,length);
  248. zero_next:
  249. offset += length;
  250. buffer += length;
  251. l -= length;
  252. //return length;
  253. }
  254. return 1;
  255. }
  256. static size_t write_virtual(struct vmdk_prm *dsk, struct file *f, loff_t offset, void *buffer, unsigned long length)
  257. {
  258. unsigned int granule_offset;
  259. unsigned int grain_index;
  260. unsigned int sector_map_idx;
  261. char tail = 1;
  262. loff_t l;
  263. uint64_t m;
  264. int32_t ll;
  265. uint32_t n;
  266. ll = length;//signed  unsigned
  267. while(ll > 0){//一开始没有考虑这个while,几乎导致个人的崩溃
  268. m = offset;
  269. granule_offset = do_div(m, granule_size);
  270. //length = MIN(length, granule_size - granule_offset);
  271. if(length > granule_size - granule_offset)
  272. length = granule_size - granule_offset;
  273. length = MIN(length, disk_limit - offset);
  274. m = offset >> SECTOR_BITS;
  275. do_div(m, dsk->sectors_per_table);
  276. sector_map_idx = m;
  277. if (sector_map_idx >= dsk->directory_size) {
  278. printk(KERN_ALERT "cannot locate grain table for %d in %d\n", sector_map_idx, dsk->directory_size);
  279. return -1;
  280. }
  281. if (l1dir[sector_map_idx] == 0)
  282. return -1;

  283. if (sector_map_idx != cached_l2dir) {
  284. if (read_l2dir(f, l1dir[sector_map_idx], dsk->grain_table_size)) {
  285. printk(KERN_ALERT "read failed\n");
  286. return -1;
  287. }
  288. cached_l2dir = sector_map_idx;
  289. }

  290. m = offset >> SECTOR_BITS;
  291. n = do_div(m, dsk->sectors_per_table);
  292. grain_index = n / dsk->sectors_per_grain;
  293. if (grain_index >= dsk->grain_table_size) {
  294. printk(KERN_ALERT "grain to large\n");
  295. return -1;
  296. }

  297. if (l2dir[grain_index] == 0){
  298. printk(KERN_ALERT "gaga\n");
  299. if((l = f->f_op->llseek(f, 0, SEEK_END)) == (loff_t)-1)
  300. return -1;

  301. l2dir[grain_index] = l >> SECTOR_BITS;
  302. if(write_physical(f, ((l1dir[sector_map_idx] << SECTOR_BITS) + (grain_index << 2)), 4, &l2dir[grain_index]) != 4)
  303. return -1;
  304. if(f->f_op->llseek(f, 64*1024 - 1, SEEK_END) == (loff_t)-1)
  305. return -1;
  306. if(f->f_op->write(f, &tail, 1, &f->f_pos) < 0)
  307. return -1;
  308. }
  309. //printk(KERN_ALERT "write_virtual 0x%x\n", length);
  310. if(write_physical(f, (l2dir[grain_index] << SECTOR_BITS) + granule_offset, length, buffer) != length){
  311. printk(KERN_ALERT "write error\n");
  312. return -1;
  313. }
  314. //return length;
  315. buffer += length;
  316. offset += length;
  317. ll -= length;
  318. }
  319. return 1;
  320. }
  321. static int vmdk_transfer(struct vmdk_dev *dev, unsigned long sector,
  322. unsigned long nsect, char *buffer, int write)
  323. {
  324. loff_t offset = sector*KERNEL_SECTOR_SIZE;
  325. loff_t nbytes = nsect*KERNEL_SECTOR_SIZE;
  326. int ret = 0;
  327. struct file *f = dev->fvmdk;
  328. mm_segment_t old_fs;
  329. if ((offset + nbytes) > dev->size) {
  330. printk (KERN_NOTICE "Beyond-end write %x\n", offset);
  331. printk (KERN_NOTICE "Beyond-end write %x\n", nbytes);
  332. return -1;
  333. }

  334. old_fs = get_fs();
  335. set_fs(KERNEL_DS);

  336. if (write){
  337. ret = (write_virtual(&vdsk, f, offset, buffer, nbytes) > 0);
  338. //ret = -1;

  339. }
  340. else{
  341. ret = (copy_virtual(&vdsk, f, offset, buffer, nbytes) > 0);

  342. //printk(KERN_ALERT "read %d", ret);
  343. }

  344. set_fs(old_fs);
  345. return ((ret == -1) ? 1 : 0);
  346. }
  347. static void vmdk_handle_bio(struct vmdk_dev *dev, struct bio *bio)
  348. {
  349. int i;
  350. struct bio_vec *bvec;
  351. sector_t sector = bio->bi_sector;
  352. int status = 0;
  353. /* Do each segment independently. */
  354. bio_for_each_segment(bvec, bio, i) {
  355. char *buffer = kmap(bio_iovec_idx((bio), (i))->bv_page) + bio_iovec_idx((bio), (i))->bv_offset;
  356. status = vmdk_transfer(dev, sector, bio_cur_sectors(bio),
  357. buffer, bio_data_dir(bio) == WRITE);
  358. if(status)
  359. break;
  360. sector += bio_cur_sectors(bio);
  361. kunmap(bio_iovec_idx((bio), (i))->bv_page);
  362. }
  363. bio_endio(bio, status);
  364. }
  365. static int vmdk_rw_thread(void *data)
  366. {
  367. struct vmdk_dev *dev = data;
  368. struct bio *bio;
  369. set_user_nice(current, -20);
  370. while (!kthread_should_stop() || !bio_list_empty(&dev->vmdk_bio_list)) {
  371. if(wait_event_interruptible(dev->vmdk_event,
  372. !bio_list_empty(&dev->vmdk_bio_list) ||
  373. kthread_should_stop()))
  374. break;
  375. if (bio_list_empty(&dev->vmdk_bio_list))
  376. continue;
  377. spin_lock_irq(&dev->vmdk_lock);
  378. bio = vmdk_get_bio(dev);
  379. spin_unlock_irq(&dev->vmdk_lock);
  380. BUG_ON(!bio);
  381. vmdk_handle_bio(dev, bio);
  382. //bio_endio(bio, (status ? 0 : 1));
  383. }
  384. return 0;
  385. }
  386. static int vmdk_make_request(struct request_queue *q, struct bio *old_bio)
  387. {
  388. struct vmdk_dev *dev = q->queuedata;
  389. int rw = bio_rw(old_bio);
  390. if(rw == READA)
  391. rw = READ;
  392. BUG_ON(!dev || (rw != READ && rw != WRITE));

  393. spin_lock_irq(&dev->vmdk_lock);
  394. vmdk_add_bio(dev, old_bio);
  395. wake_up(&dev->vmdk_event); //这里唤醒内核线程,处理bio
  396. spin_unlock_irq(&dev->vmdk_lock);
  397. return 0;
  398. }
  399. struct vmdk_dev *vmdk_alloc(int i)//块设备驱动的一些初始化工作
  400. {
  401. struct vmdk_dev *dev;
  402. struct gendisk *disk;
  403. dev = kzalloc(sizeof(*dev), GFP_KERNEL);
  404. if (!dev)
  405. goto out;
  406. dev->vmdk_queue = blk_alloc_queue(GFP_KERNEL);
  407. if (!dev->vmdk_queue)
  408. goto out_free_dev;
  409. blk_queue_make_request(dev->vmdk_queue, &vmdk_make_request);//vmdk_make_request 这个函数 当有数据请求是会调用这个函数
  410. blk_queue_hardsect_size(dev->vmdk_queue, hardsect_size);
  411. dev->vmdk_queue->queuedata = dev;

  412. disk = dev->vmdk_disk = alloc_disk(16);
  413. if (!disk)
  414. goto out_free_queue;
  415. dev->vmdk_thread = NULL;
  416. init_waitqueue_head(&dev->vmdk_event);
  417. spin_lock_init(&dev->vmdk_lock);
  418. disk->major = vmdk_major;
  419. disk->first_minor = i * 16;
  420. disk->fops = &vmdk_fops;
  421. disk->private_data = dev;
  422. disk->queue = dev->vmdk_queue;
  423. sprintf(disk->disk_name, "vmdk%c", 'a');
  424. dev->vmdk_thread = kthread_create(vmdk_rw_thread, dev, "vmdk%d", i);
  425. wake_up_process(dev->vmdk_thread);
  426. if(IS_ERR(dev->vmdk_thread))
  427. goto out_free_queue;
  428. return dev;
  429. out_free_queue:
  430. blk_cleanup_queue(dev->vmdk_queue);
  431. out_free_dev:
  432. kfree(dev);
  433. out:
  434. return NULL;
  435. }
  436. static int __init vmdk_init(void)
  437. {
  438. int status;
  439. struct file *f;
  440. vmdk_major = register_blkdev(0, "vmdk");
  441. if(vmdk_major <= 0){
  442. printk(KERN_ALERT "can not get major\n");
  443. return -EBUSY;
  444. }
  445. vmdk_device = vmdk_alloc(0);
  446. if(vmdk_device == NULL){
  447. printk(KERN_ALERT "no mem\n");
  448. goto Enomem;
  449. }

  450. if(!name)
  451. goto freedisk;
  452. vmdk_device->fvmdk = filp_open(name, O_RDWR | O_LARGEFILE, 0664);
  453. if(IS_ERR(vmdk_device->fvmdk))
  454. goto freedisk;
  455. f = vmdk_device->fvmdk;
  456. status = open_disk(f);
  457. if(status < 0)
  458. goto out_close_file;
  459. set_capacity(vmdk_device->vmdk_disk, header4.capacity);
  460. vmdk_device->size = (header4.capacity) << 9;
  461. add_disk(vmdk_device->vmdk_disk);

  462. return 0;
  463. out_close_file:
  464. if(l1dir)
  465. kfree(l1dir);
  466. filp_close(vmdk_device->fvmdk, current->files);
  467. freedisk:
  468. blk_cleanup_queue(vmdk_device->vmdk_queue);
  469. kfree(vmdk_device);
  470. Enomem:
  471. unregister_blkdev(vmdk_major, "vmdk");
  472. return -ENOMEM;

  473. }
  474. static void vmdk_exit(void)
  475. {
  476. kthread_stop(vmdk_device->vmdk_thread);
  477. filp_close(vmdk_device->fvmdk,current->files);
  478. kfree(l1dir);
  479. blk_cleanup_queue(vmdk_device->vmdk_queue);
  480. del_gendisk(vmdk_device->vmdk_disk);
  481. kfree(vmdk_device);
  482. unregister_blkdev(vmdk_major, "vmdk");
  483. }
  484. module_init(vmdk_init);
  485. module_exit(vmdk_exit);
  486. module_param(name, charp, S_IRUGO);
  487. MODULE_LICENSE("GPL");
复制代码
第一次写这个代码的时候没做什么注释,只是写了个简单的文档,其实是10年1月时写的代码,刚刚来道上混,不懂江湖规矩,大家将就一下吧。现在把代码拿出来拿出来与大家分享,想做点注释,但是又觉得麻烦。因为代码的调用关系很简单。并且也没有内核中那么复杂的merge算法,纯bio操作。如果你了解基本的块设备驱动的编写方法,并且有这方面的工作需求(都是b出来的),读这部分的代码应该很容易的。
写这个代码用了我大约40天的时间,呵呵。很痛苦的经历。虽然代码写的很简单,也不是多完善,功能上确实可以正常运行了。
个人觉得在编写代码中遇到的难点是:
1.如何在内核态读取文件,cu上很多类似的代码都是insmod时读取,但是如果你需要insmod之后读取呢,这时就需要自己建立一个进程上下文的环境,开启内核线程。
2.对vmdk文件格式的把握,然后在代码中严格按照格式去操作,代码中的copy_virtual
和write_virtual是历经磨难
3.对bio结构体的操作,需要kmap和kunmap
4.开启内核线程之后,需要互斥对bio链表的操作,内核发来bio,这时唤醒内核线程对bio操作,所以需要互斥
在加载驱动后,会在/dev下看到vmdka vmdka1之类的设备节点,如果你的vmdk文件是包括文件系统的,这时你可以mount这个设备节点,然后到相应的目录下进行操作就可以了
参考:
ldd3
ldk2
cu上赵磊大侠的文章
qemu源码 vmdk2raw.c
vmdk-loop.c
linux loop驱动
无数次的google
一两次的cu发帖
vm-spec文档

评分

参与人数 2可用积分 +15 收起 理由
Godbach + 10 感谢分享
瀚海书香 + 5 感谢分享!

查看全部评分

论坛徽章:
0
2 [报告]
发表于 2011-07-04 13:55 |只看该作者
本帖最后由 almeydifer 于 2011-07-04 13:59 编辑

我觉得楼主发这个贴很有意义呀,不过最好能够把文字在梳理一遍,这样可读性会更强。

顺便问一下楼主,GDE和GTE是用来干什么用的呢?是用来在spare file里实现动态增长的核心数据结构吗?

论坛徽章:
0
3 [报告]
发表于 2011-07-04 14:30 |只看该作者
回复 2# almeydifer


    GDE GTE 其实和linux的页表原理差不多   因为我们平时虚拟机里面的操作系统所用的数据 都是存储在创建的那个大文件里面 ,其实就是那个大文件就是一个磁盘数据+磁盘信息+数据定位信息等
你如果要访问某个位置的数据,首先计算位于哪个GDE 然后计算位于哪个GTE entry  然后再计算具体位置  。如果是写的话,也是和读一样,因为是sparse文件,如果要写入的地方,在GTE entry中是空的,
那么说明需要增长sparse文件,代码中是往文件尾追加64K(我觉得这里可能不是很好的实现方式,敬请指正,为什么追加64k,是因为一个GTE entry 默认代表128个扇区 也就是64K),直到增大到sparse
文件描述符所指名的capacity为止。

    我有点懒得整理了,呵呵。其实完全不用vmdk 你可以自己创建一个flat格式的文件,然后把它模拟为一个块设备。

论坛徽章:
0
4 [报告]
发表于 2011-07-05 14:09 |只看该作者
回复 3# 木叉叉木大


    谢谢啦。打听一下,您做这个是自己的爱好,还是工作/科研的需要呢?

论坛徽章:
0
5 [报告]
发表于 2011-07-06 16:46 |只看该作者
回复 4# almeydifer


This is the detail. The goal is to port the last code to colinux , in order to make colinux boot from sparse file not just flat file. And I really did that.
But I do not know whether it is commercialized,because i quit the job to take further study.If possible , i will also share with people the porting code.

论坛徽章:
0
6 [报告]
发表于 2011-07-07 11:44 |只看该作者
回复 5# 木叉叉木大

谢谢您的回复呀。

我有个问题就是:为什么colinux要用到spare file?

我的理解是:spare file虽然可以容量动态增长,节省空间,但是在传统的旋转磁盘上,这种方式会造成磁盘碎片,如果此时有多个虚拟机或者OS instances同时做磁盘I/O,性能会更加变得难以分析或者理解。

论坛徽章:
0
7 [报告]
发表于 2011-07-08 10:26 |只看该作者
Sorry, I do not have enough knowledge to answer your question about the performance, because the original coLinux VM only supported flat files. The size of a flat file does not change whether you write to it or not.

As we all know, vmware ,qemu or virtualbox all support sparse file in order to save space.

I am a newbie. The manager ask me to do that,and what I care is how to make it.

The author of coLinux gave me a suggestion to port the sparse-file code to the Windows platform instead of modifying the Linux kernel; that way, the switch between Linux and Windows would consume fewer system resources. But I modified the Linux kernel simply because I had no knowledge of the Windows platform.

论坛徽章:
0
8 [报告]
发表于 2011-07-08 14:28 |只看该作者
回复 1# 木叉叉木大


    先Mark一下,到时候再看!

论坛徽章:
36
IT运维版块每日发帖之星
日期:2016-04-10 06:20:00IT运维版块每日发帖之星
日期:2016-04-16 06:20:0015-16赛季CBA联赛之广东
日期:2016-04-16 19:59:32IT运维版块每日发帖之星
日期:2016-04-18 06:20:00IT运维版块每日发帖之星
日期:2016-04-19 06:20:00每日论坛发贴之星
日期:2016-04-19 06:20:00IT运维版块每日发帖之星
日期:2016-04-25 06:20:00IT运维版块每日发帖之星
日期:2016-05-06 06:20:00IT运维版块每日发帖之星
日期:2016-05-08 06:20:00IT运维版块每日发帖之星
日期:2016-05-13 06:20:00IT运维版块每日发帖之星
日期:2016-05-28 06:20:00每日论坛发贴之星
日期:2016-05-28 06:20:00
9 [报告]
发表于 2011-07-08 14:52 |只看该作者
回复 1# 木叉叉木大


    感谢 LZ  分享

论坛徽章:
0
10 [报告]
发表于 2011-07-08 15:43 |只看该作者
sorry, I do not have the enough knowledge to answer your question about the performance. becasue the ...
木叉叉木大 发表于 2011-07-08 10:26



嗯,没关系,这么说来colinux应该是在pc上取代VMware workstation的一个较轻量级的方案啦,是应该用sparse file。
您需要登录后才可以回帖 登录 | 注册

本版积分规则 发表回复

  

北京盛拓优讯信息技术有限公司. 版权所有 京ICP备16024965号-6 北京市公安局海淀分局网监中心备案编号:11010802020122 niuxiaotong@pcpop.com 17352615567
未成年举报专区
中国互联网协会会员  联系我们:huangweiwei@itpub.net
感谢所有关心和支持过ChinaUnix的朋友们 转载本站内容请注明原作者名及出处

清除 Cookies - ChinaUnix - Archiver - WAP - TOP