1)既然是cache,就有一个hash结构
hash的索引是(dev,block),这个dev是kdev_t,不是那个blkdev,block_device。。。kdev_t 到block device的映射以后再谈吧。值得说明的是只有加入了这个hash表的buffer才能叫做进入了buffer cache。
这个部分包括hash表的hash算法,hash链表的维护(add delete 。。。),
static unsigned int bh_hash_mask;
static unsigned int bh_hash_shift;
static struct buffer_head **hash_table;
static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
#define _hashfn(dev,block) 。。。
#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
static __inline__ void __hash_unlink(struct buffer_head *bh)
static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
struct buffer_head * get_hash_table(kdev_t dev, int block, int size)

2)缓存就要考虑数据老化,缓存回收的问题,所以有个 lru list
static struct buffer_head *lru_list[NR_LIST];
static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
static int nr_buffers_type[NR_LIST];
static unsigned long size_buffers_type[NR_LIST];
呵呵,没有进行啥包装,整个struct 多好。列一下buffer的各种lru队列。
#define BUF_CLEAN 0
#define BUF_LOCKED 1 /* Buffers scheduled for write */
#define BUF_DIRTY 2 /* Dirty buffers, not yet scheduled for write */
#define BUF_PROTECTED 3 /* Ramdisk persistent storage */
#define NR_LIST 4
static void __insert_into_lru_list(struct buffer_head * bh, int blist)
static void __remove_from_lru_list(struct buffer_head * bh, int blist)
static void __refile_buffer(struct buffer_head *bh)
void refile_buffer(struct buffer_head *bh)
static __inline__ void __mark_dirty(struct buffer_head *bh)
void __mark_buffer_dirty(struct buffer_head *bh)
static inline void __mark_buffer_clean(struct buffer_head *bh)
static inline void mark_buffer_clean(struct buffer_head * bh)
static inline void __mark_buffer_protected(struct buffer_head *bh)
static inline void mark_buffer_protected(struct buffer_head * bh)
这些mark函数当然是标记buffer 的各种状态,然后通过 refile_buffer在各种类型的lru队列间移动。比较简单,就算是考虑的同步和互斥
啥的也不能算作是复杂吧? 有时需要一些打包函数,将buffer head 同时加入hash 和lru队列。
static void __remove_from_queues(struct buffer_head *bh)
static void __insert_into_queues(struct buffer_head *bh)

__refile_buffer 中有个 remove_inode_queue(bh) 的操作值得注意一下。
/*
* A buffer may need to be moved from one buffer list to another
* (e.g. in case it is not shared any more). Handle this.
*/
/*
 * A buffer may need to be moved from one buffer list to another
 * (e.g. in case it is not shared any more). Handle this.
 */
static void __refile_buffer(struct buffer_head *bh)
{
	int target = BUF_CLEAN;

	/* Highest-priority state wins: protected > dirty > locked > clean. */
	if (buffer_locked(bh))
		target = BUF_LOCKED;
	if (buffer_dirty(bh))
		target = BUF_DIRTY;
	if (buffer_protected(bh))
		target = BUF_PROTECTED;

	if (target == bh->b_list)
		return;		/* already on the right lru list */

	__remove_from_lru_list(bh, bh->b_list);
	bh->b_list = target;
	/* A buffer going clean no longer needs to hang off its inode's
	   dirty-buffer queue. */
	if (target == BUF_CLEAN)
		remove_inode_queue(bh);
	__insert_into_lru_list(bh, target);
}
inode 中有个 inode->i_dirty_buffers 记录了这个inode中所有dirty的数据。稍后我们再分析这个dirty的数据是什么:元数据还是文件
数据。
/* The caller must have the lru_list lock before calling the
remove_inode_queue functions. */
/* The caller must have the lru_list lock before calling the
   remove_inode_queue functions. */
static void __remove_inode_queue(struct buffer_head *bh)
{
	/* Unlink from the inode's dirty-buffer list, then drop the
	   back-pointer to the inode. */
	list_del(&bh->b_inode_buffers);
	bh->b_inode = NULL;
}
static inline void remove_inode_queue(struct buffer_head *bh)
{ if (bh->b_inode) //可以看出,并不是每个buffer 都和一个inode 相对应的,只有以部分才有.
__remove_inode_queue(bh);
}
int inode_has_buffers(struct inode *inode);//这个简单。。
到底什么buffer才有inode与之对应,等分析完buffer cache的创建就会清楚了。
我们先来看看buffer cache 的创建,藉此研究buffer cache 中的内容以及buffer cache 和系统其他几个部分之间的关系:

3)buffer cache 的创建与buffer head 的回收

实际上,有两种类型的buffer_head 存在于系统中,一种存在于buffer cache, 存在于buffer cache 中的 buffer(head)
必然存在于lru list。这种类型的buffer 其唯一的分配途径就是 getblk, 然后通过bread(kdev_t dev, int block, int size)被广泛用于读取文件的元数据:
struct buffer_head * [color="#006600"]getblk(kdev_t dev, int block, int size)
{
....
repeat:
spin_lock(&lru_list_lock);
write_lock(&hash_table_lock);
bh = __get_hash_table(dev, block, size); //look the block up in the hash table first
if (bh)
goto out; //cache hit: nothing more to do
isize = BUFSIZE_INDEX(size);
spin_lock(&free_list[isize].lock);
bh = free_list[isize].list; //miss: try to take a buffer head from the free list
if (bh) {
__remove_from_free_list(bh, isize);
atomic_set(&bh->b_count, 1);
}
spin_unlock(&free_list[isize].lock);
/*
* OK, FINALLY we know that this buffer is the only one of
* its kind, we hold a reference (b_count>0), it is unlocked,
* and it is clean.
*/
if (bh) {
init_buffer(bh, NULL, NULL);
bh->b_dev = dev;
bh->b_blocknr = block;
/* NOTE(review): the next line is garbled by extraction — in the 2.4
   source it presumably reads "bh->b_state = 1 << BH_Mapped;", and the
   remainder of getblk() (insert into queues, refill_freelist, goto
   repeat) has been lost from this listing. */
bh->b_state = 1 buffers = bh;
page->flags &= ~(1 mapping->a_ops->readpage(filp, page);->ext2_readpage->[color="#006600"]block_read_full_page
generic_file_write ->mapping->a_ops->prepare_write(file, page, offset, offset+bytes);->ext2_prepare_write-> block_prepare_write
generic_file_write -> mapping->a_ops->commit_write(file, page, offset, offset+bytes) -> generic_commit_write
这里给出一个图示说明page cache, filemap,buffer cache, buffer entry(仅作io entry的buffer)的关系(也许不是100%正确!!)
马上回顾一下buffer_head的回收,就会发现,这种类型的buffer 很自然的进入page cache继而通过[color="#006600"]try_to_free_buffers 进行回收.
实在没有必要把这些函数的实现都列到这里仔细讨论了,仅以其中一个为例吧,但是在讨论前还是说一下这些函数的用途吧:
这些函数值得注意的是写文件的方式,第一种提供给具体的文件系统使用,参考generic_file_write,
int block_prepare_write(struct page *page, unsigned from, unsigned to,...)
int generic_commit_write(struct file *file, struct page *page,...)
我们在讨论generic_file 的读写时也涉及到这些函数。
另外一种类型的是 block_write_full_page,像是上面两个函数的打包,其实其中有不同。
我们回顾一下generic_file_write的基本操作流程:
ssize_t generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
{
............ //omitted
while (count) {
unsigned long bytes, index, offset;
char *kaddr;
int deactivate = 1; /* Try to find the page in the cache. If it isn't
there, allocate a free page. */
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
。。。。
page = __grab_cache_page(mapping, index, &cached_page);
if (!page)
break;
/* We have exclusive IO access to the page.. */
if (!PageLocked(page)) {
PAGE_BUG(page);
} /* for ext2: read the file's page in from disk first, allocating disk
blocks for the file if necessary */
status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
if (status)
goto unlock;
kaddr = page_address(page);
status = copy_from_user(kaddr+offset, buf, bytes);
flush_dcache_page(page);
if (status)
goto fail_write; /* for ext2: mark all the page's bh dirty and mark the
corresponding inode dirty — see ext2_aops */
status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
.............//omitted
/* For now, when the user asks for O_SYNC, we'll actually
* provide O_DSYNC. */
if ((status >= 0) && (file->f_flags & O_SYNC))
status = generic_osync_inode(inode, 1); /* 1 means datasync */
/* NOTE(review): from here to the end the listing appears spliced with a
fragment of __block_prepare_write() — buffer_new()/ll_rw_block() do not
belong to generic_file_write() itself. */
if (buffer_new(bh)) {
[color="#ff0000"] [color="#ff0000"]unmap_underlying_metadata(bh); [color="#cc0000"]//this time we'll finally sort this thing out :-)
.....
}
}
if (!buffer_uptodate(bh) &&
(block_start to)) {
ll_rw_block(READ, 1, &bh); //read it in so it becomes uptodate
*wait_bh++=bh;
}
}
......
}
unmap_underlying_metadata 曾经是一个很困惑的问题,这次终于能够了断了 :-) 我们曾经在linuxforum上有一个讨论,但是基本上没有说到点子上,见这个帖子: linux forum上关于 unmap_underlying_metadata 的讨论 http://www.linuxforum.net/forum/showthreaded.php?Cat=&Board=linuxK&Number=408077&page=&view=&sb=&o=
这次分析到这里,没有办法,经过刻苦的寻找,终于找到了1999年关于这个问题的一些线索,其实很简单,我终于受到了启发:
这个讨论启发了我: http://www.mail-archive.com/linux-fsdevel@vger.rutgers.edu/msg00298.html 问题的根源在于buffer 的释放问题:真正从buffer cache中消除buffer的函数是__bforget, 然而只有(少数文件系统系统直接调用__bforget)unmap_underlying_metadata, try_to_free_buffers (page_lunder)是进入这个过程的常见入口. 设想这个一个流程:
1) 打开 foo/xxx , 修改xxx的内容
2)rm foo
3)吧xxx元数据所占用的block分配给新的文件, 现在,因为rm foo的时候我们并没有及时调用__bforget, 所以buffer cache 中还有一个alias的buffer.
至于以前讨论的,我们认为通过dd这种操作raw设备的方式所拥有的alias, 并不在unmap_underlying_metadata 考
虑的范围内.本来,2.4的时候已经不负责buffer cache和page
cache之间的同步了.这里有必要性不在于这个alias在buffer
cache中,而在于他是ditry的如果不clear掉,就会引起data corrupt. 2.4以后仅仅是drop掉数据就够了.
/*
* bforget() is like brelse(), except it puts the buffer on the
* free list if it can.. We can NOT free the buffer if:
* - there are other users of it
* - it is locked and thus can have active IO
*/
void [color="#006600"]__bforget(struct buffer_head * buf)
{
/* grab the lru lock here to block bdflush. */
spin_lock(&lru_list_lock);
write_lock(&hash_table_lock);
if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
goto in_use;
[color="#006600"]__hash_unlink(buf);
remove_inode_queue(buf);
write_unlock(&hash_table_lock);
__remove_from_lru_list(buf, buf->b_list);
spin_unlock(&lru_list_lock);
put_last_free(buf);
return;
in_use:
write_unlock(&hash_table_lock);
spin_unlock(&lru_list_lock);
}
/*
* We are taking a block for data and we don't want any output from any
* buffer-cache aliases starting from return from that function and
* until the moment when something will explicitly mark the buffer
* dirty (hopefully that will not happen until we will free that block ;-)
* We don't even need to mark it not-uptodate - nobody can expect
* anything from a newly allocated buffer anyway. We used to used
* unmap_buffer() for such invalidation, but that was wrong. We definitely
* don't want to mark the alias unmapped, for example - it would confuse
* anyone who might pick it with bread() afterwards...
*/
/*
 * We are taking a block for data and we don't want any output from any
 * buffer-cache aliases starting from return from that function and
 * until the moment when something will explicitly mark the buffer
 * dirty (hopefully that will not happen until we will free that block ;-)
 * We don't even need to mark it not-uptodate - nobody can expect
 * anything from a newly allocated buffer anyway. We used to used
 * unmap_buffer() for such invalidation, but that was wrong. We definitely
 * don't want to mark the alias unmapped, for example - it would confuse
 * anyone who might pick it with bread() afterwards...
 */
static void unmap_underlying_metadata(struct buffer_head * bh)
{
	struct buffer_head *alias;

	/* Is there a stale buffer-cache entry for this device block? */
	alias = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
	if (!alias)
		return;

	mark_buffer_clean(alias);
	wait_on_buffer(alias);
	clear_bit(BH_Req, &alias->b_state);
	/* Here we could run brelse or bforget. We use bforget because
	   it will try to put the buffer in the freelist. */
	__bforget(alias);
}
//mapping->a_ops->commit_write -> [color="#006600"]block_commit_write -->__block_commit_write
static int [color="#006600"]__block_commit_write(struct inode *inode, struct page *page,
unsigned from, unsigned to)
{
5)buffer cache的老化回收:lru 队列
bdflush 进程主要负责将dirty的buffer 写入磁盘, 通过上面的分析我们知道无论是元数据还是文件数据,都通过bh进入lru队列。
union bdflush_param {
} bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}};
/* These are the min and max parameter values that we will allow to be assigned */
int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0};
int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 100, 0, 0};
作为buffer cache,必须有buffer_head, struct
page,和数据区(物理内存页面),缺一不可,并且要同时(几乎都是同时的呵呵)加入lru list 和hash表,这个我们在分析page
cache (filemap.c) 的时候就见过类似的概念了。
另外文件数据只进入lru 队列,并不加入buffer cache,要时刻记住了.
我们从 bdflush 开始吧.
sys_bdflush: 配置,略.
从 __init bdflush_init(void) 知道有两个内核线程专注于回收buffers: bdflush 和 kupdate.
/*
* This is the actual bdflush daemon itself. It used to be started from
* the syscall above, but now we launch it ourselves internally with
* kernel_thread(...) directly after the first thread in init/main.c
*/
int [color="#006600"]bdflush(void *sem)
{
struct task_struct *tsk = current;
int flushed;
....// initialization, omitted
....//clear signals, omitted
for (;;) { //the main job:
CHECK_EMERGENCY_SYNC //leave this thing for later
flushed = flush_dirty_buffers(0); //flush buffers: walk all the lru lists and start disk I/O, nothing more
if (free_shortage()) //physical pages are running short
flushed += page_launder(GFP_KERNEL, 0); //try to reclaim some pages; more dirty pages will enter the buffer lru via their bh
/*
* If there are still a lot of dirty buffers around,
* skip the sleep and flush some more. Otherwise, we
* go to sleep waiting a wakeup.
*/
set_current_state(TASK_INTERRUPTIBLE);
/* NOTE(review): the next line is garbled by extraction — it appears to
splice the tail of bdflush() together with the body of kupdate();
everything from here down to the final brace belongs to kupdate(). */
if (!flushed || balance_dirty_state(NODEV) state = TASK_INTERRUPTIBLE;
schedule_timeout(interval); //run at a fixed interval
} else {
stop_kupdate:
tsk->state = TASK_STOPPED;
schedule(); /* wait for SIGCONT */
}
/* check for sigstop */
if (signal_pending(tsk)) {
int stopped = 0;
spin_lock_irq(&tsk->sigmask_lock);
if (sigismember(&tsk->pending.signal, SIGSTOP)) {//stop running when SIGSTOP is received
sigdelset(&tsk->pending.signal, SIGSTOP);
stopped = 1;
}
recalc_sigpending(tsk);
spin_unlock_irq(&tsk->sigmask_lock);
if (stopped)
goto stop_kupdate;
}
#ifdef DEBUG
printk("kupdate() activated...\n");
#endif
sync_old_buffers(); //net effect: run this function at a fixed interval
}
}
/*
* Here we attempt to write back old buffers. We also try to flush inodes
* and supers as well, since this function is essentially "update", and
* otherwise there would be no way of ensuring that these quantities ever
* get written back. Ideally, we would have a timestamp on the inodes
* and superblocks so that we could write back only the old ones as well
*/
static int sync_old_buffers(void)
{
	lock_kernel();
	sync_supers(0);		/* write back the superblocks */
	sync_inodes(0);		/* write back the inodes themselves and their filemap pages */
	unlock_kernel();

	/* After the writeback above there are even MORE dirty bh on the lru
	 * lists!  Flush only those whose timestamp says they are old enough —
	 * the same job bdflush does: just start the disk I/O.
	 * (NOTE(review): in the original listing this call had been swallowed
	 * into the preceding comment by extraction; restored here to match
	 * the 2.4 source.) */
	flush_dirty_buffers(1);

	/* must really sync all the active I/O request to disk here */
	run_task_queue(&tq_disk);	/* don't let bh sleep forever in the disk
					 * queue — nothing timer-driven runs it,
					 * it is only kicked by hand */
	return 0;
}
顺便去看看tq_disk: 这是一个task queue, 但是不是所有的task queue 都会得到自动执行的.
其实本系列所覆盖的代码(kernel fs(only ext2/proc/devfs and common fs surport ) mm
driver/(ide pci )) 只有extern task_queue tq_timer, tq_immediate, tq_disk;
这三个task queue, 而其中tq_disk没有像另外两个一样挂接到bottom half的处理中去.
其他接口函数:
int block_sync_page(struct page *page)
void wakeup_bdflush(int block) 再说一下buffer head 的回收
try_to_free_buffers 是buffer 回收和buffer head 回收的主要入口. 不论是buffer cache
中的buffer 以及bh还是作为io entry的buffer 以及bh, 绝大多数都是通过page cache的lru队列进行回收的.
我们看到buffer cache 中的page 页面也加入了page cache的lru队列(不过仅仅是加入lru队列而已,不会在page
cache 的hash队列中看到的). 另外在flash 一个page 的时候也会试图释放buffer head
见block_flushpage(用于文件的truncate). 剩余部分:sync invalidate truncate Sync: 文件系统的dirty数据是以一定的策略,定时回写的,有时需要马上把dirty数据回写到硬盘上,这就需要sync的支持了.
这里边,sync_page_buffers(struct buffer_head *bh, int wait)就是为了try_to_free_buffers用用.不太关乎这里的文件sync操作.
不妨看看sync操作的几种情形:
1) fsync 和 fdatasync(int fd):希望下面man出来的信息已经足够理解这两个操作了
fdatasync() flushes all data buffers of a file to disk (before
the sys-tem call returns). It resembles fsync() but is not required
to update the metadata such as access time.
asmlinkage long sys_fsync(unsigned int fd)
{
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;
	int err = -EBADF;

	file = fget(fd);
	if (!file)
		goto out;

	dentry = file->f_dentry;
	inode = dentry->d_inode;

	err = -EINVAL;
	if (!file->f_op || !file->f_op->fsync)
		goto out_putf;

	/* We need to protect against concurrent writers.. */
	down(&inode->i_sem);
	/* Submit all dirty pages of the mapping:
	   mapping->a_ops->writepage, i.e. ext2_writepage ->
	   block_write_full_page, submits every bh to the driver. */
	filemap_fdatasync(inode->i_mapping);
	/* Basically file_fsync; for ext2 this ends up in fsync_inode_buffers. */
	err = file->f_op->fsync(file, dentry, 0);
	/* Wait for the I/O submitted above to complete. */
	filemap_fdatawait(inode->i_mapping);
	up(&inode->i_sem);

out_putf:
	fput(file);
out:
	return err;
}
asmlinkage long sys_fdatasync(unsigned int fd)
{
.............................
filemap_fdatasync(inode->i_mapping);
err = file->f_op->fsync(file, dentry, 1); /* the only difference from sys_fsync above: datasync flag = 1 */
filemap_fdatawait(inode->i_mapping);
...............
}
2) dev的sync : man sync: 将所有data写入磁盘,包括super block
asmlinkage long sys_sync(void)
{
fsync_dev(0); /*0 代表sync所有设备*/
return 0;
}
3)O_SYNC :open 一个文件的时候指定以同步方式写入文件.
generic_file_write->generic_osync_inode -> osync_inode_buffers(inode);或者 fsync_inode_buffers(inode)
int osync_inode_buffers(struct inode *inode): 就是等待inode上的dirty buffer io完成.
int fsync_inode_buffers(struct inode *inode): 对当前的inode上的dirtybuffer
提交bh到块驱动程序, 然后等待这些buffer io完成,最后调用
osync_inode_buffers,等待在这个过程中其他提交了写操作的buffer.
然后看看在这些接口函数后面,真正干活的吧:
/*
* filp may be NULL if called via the msync of a vma.
*/
/*
 * filp may be NULL if called via the msync of a vma.
 */
int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
	struct inode * inode = dentry->d_inode;
	struct super_block * sb;
	kdev_t dev;
	int ret;

	lock_kernel();

	/* sync the inode to buffers — re-runs the filemap sync, then
	   writes the inode itself */
	write_inode_now(inode, 0);

	/* sync the superblock to buffers */
	sb = inode->i_sb;
	lock_super(sb);
	if (sb->s_op && sb->s_op->write_super)
		sb->s_op->write_super(sb);	/* write the super block */
	unlock_super(sb);

	/* .. finally sync the buffers to disk — the steps above only
	   submitted writes to the block device (or just marked things
	   dirty); this performs the writes and waits for them */
	dev = inode->i_dev;
	ret = sync_buffers(dev, 1);

	unlock_kernel();
	return ret;
}
void sync_dev(kdev_t dev)和fync_dev类似,只是不等待io操作完成:
int fsync_dev(kdev_t dev)
{
	/* First pass: write out the buffers already marked dirty. */
	sync_buffers(dev, 0);

	lock_kernel();
	sync_supers(dev);	/* marks more dirty bh (superblocks) */
	sync_inodes(dev);	/* marks more dirty bh, including the filemap ones */
	DQUOT_SYNC(dev);
	unlock_kernel();

	/* Second pass: write the freshly marked bh and wait for the I/O
	   to complete. */
	return sync_buffers(dev, 1);
}
最后static int sync_buffers(kdev_t dev, int wait) 虽然不短,但是也是比较好理解的.看看他分三趟写入bh的方式就可以了吧?
/* One pass for no-wait, three for wait:
* 0) write out all dirty, unlocked buffers;
* 1) write out all dirty buffers, waiting if locked;
* 2) wait for completion by waiting for all buffers to unlock.
*/ invalidate:在unmount 文件系统,删除一个文件,或者发生disk change等
状况的时候,我们需要将文件或这设备上所有数据丢弃,这时需要的是invalidate.
对于文件,invalidate_inode_buffers 只是将 inode 的dirty buffer
和这个inode脱离关系,对dirty的buffer不做任何处理.(这些buffer 既含有meta数据又有文件数据),从这里看过去就知道 unmap_underlying_metadata 的重要之处了.
对于一个设备的invalidate操作分成两种,一种需要保留dity的buffer,一种干脆丢弃所有dirty的buffer:__invalidate_buffers
#define invalidate_buffers(dev) __invalidate_buffers((dev), 0)
#define destroy_buffers(dev) __invalidate_buffers((dev), 1)
destroy 的时候把 dirty buffer 统统从buffer cache摘除,然后放到buffer 的free链表中去.
而invalidate 则仅仅减少引用计数,当然clean buffer在两种操作之中都会放到free list中去.
一般进行invalidate的时候都先进行了sync操作.... truncate:
截断一个文件. 对截断的部分进行flush操作. 本模块提供flush的支持:最终的操作都要归结与buffer的操作,page
cache以buffer作为io entry,而元数据则是直接用buffer cache了。有两个接口函数用于buffer 的flush操作:
int block_flushpage(struct
page *page, unsigned long offset) : 将page上的buffer
进行unmap并将bh标记为clean如果是整个页面都被flush,还尝试释放buffer(try_to_free_buffers:
把buffer彻底释放,包括buffer head也释放掉,这是buffer
head的另一个释放途经,也是通过page来的。对这个函数要说明一点,看下面的图:含有offset的那个buffer是没有动过的。注意到这点有助
于理解block_truncate_page。