论坛徽章:: 0

电梯直达

1楼 [收藏(0)] [报告]

发表于 2012-10-09 18:26 |只看该作者 |倒序浏览

本帖最后由 blake326 于 2012-10-09 18:29 编辑

3.6 kernel

/*
 * write(2) system call entry point.
 *
 * Looks up the struct file for @fd, snapshots its current position, hands
 * the request to vfs_write() and then stores the (possibly advanced)
 * position back into the file before dropping the reference.
 *
 * Returns whatever vfs_write() returns, or -EBADF when @fd does not
 * resolve to an open file.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	int fput_needed;
	struct file *file = fget_light(fd, &fput_needed);
	ssize_t ret;
	loff_t pos;

	if (!file)
		return -EBADF;

	/* Work on a local copy of the offset; publish it after the write. */
	pos = file_pos_read(file);
	ret = vfs_write(file, buf, count, &pos);
	file_pos_write(file, pos);
	fput_light(file, fput_needed);

	return ret;
}
/*
 * vfs_write - filesystem-independent part of a write.
 * @file:  file being written
 * @buf:   user-space source buffer
 * @count: number of bytes requested
 * @pos:   in/out file offset
 *
 * Validates the request (file opened for writing, a write method exists,
 * the user buffer is accessible, rw_verify_area() accepts the range) and
 * then dispatches to ->write, or to do_sync_write() when only ->aio_write
 * is provided.
 *
 * Returns the value of the write method, or -EBADF / -EINVAL / -EFAULT /
 * the rw_verify_area() error when validation fails.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!file->f_op || !(file->f_op->write || file->f_op->aio_write))
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret < 0)
		return ret;

	/* rw_verify_area() hands back the byte count to actually use. */
	count = ret;
	ret = file->f_op->write ? file->f_op->write(file, buf, count, pos)
				: do_sync_write(file, buf, count, pos);

	/* Notification and byte accounting only when data was written. */
	if (ret > 0) {
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	/* The syscall counter is bumped for every attempt that got this far. */
	inc_syscw(current);

	return ret;
}
/*
 * do_sync_write - synchronous write implemented on top of ->aio_write.
 * @filp: file being written
 * @buf:  user-space source buffer
 * @len:  number of bytes to write
 * @ppos: in/out file offset
 *
 * Wraps @buf in a single-segment iovec and a synchronous kiocb, then calls
 * ->aio_write until it stops asking for a retry. If the request was queued
 * (-EIOCBQUEUED) the function waits for it to complete. *@ppos is updated
 * from the kiocb position before returning.
 *
 * Returns the final result of the write (bytes written or negative errno).
 */
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;
	kiocb.ki_nbytes = len;

	/* Keep resubmitting while the method reports -EIOCBRETRY. */
	for (;;) {
		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
		if (ret != -EIOCBRETRY)
			break;
		wait_on_retry_sync_kiocb(&kiocb);
	}

	/* Queued asynchronously: block until the kiocb completes. */
	if (-EIOCBQUEUED == ret)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
/*
 * generic_file_aio_write - locked wrapper around __generic_file_aio_write().
 * @iocb:    I/O control block; iocb->ki_pos must equal @pos
 * @iov:     source segments
 * @nr_segs: number of entries in @iov
 * @pos:     file offset at which to write
 *
 * Brackets the write with sb_start_write()/sb_end_write(), takes
 * inode->i_mutex around the actual write, and afterwards calls
 * generic_write_sync() when data was written or the request was queued.
 *
 * Returns the write result; a sync error replaces a positive byte count.
 */
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	BUG_ON(iocb->ki_pos != pos);

	sb_start_write(inode->i_sb);

	mutex_lock(&inode->i_mutex);
	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
	mutex_unlock(&inode->i_mutex);

	if (ret > 0 || ret == -EIOCBQUEUED) {
		ssize_t sync_err = generic_write_sync(file, pos, ret);

		/* Only override a successful count, never -EIOCBQUEUED. */
		if (sync_err < 0 && ret > 0)
			ret = sync_err;
	}

	sb_end_write(inode->i_sb);
	return ret;
}

/*
 * __generic_file_aio_write - write iovec data to a file; locking is left
 * to the caller (generic_file_aio_write() calls this under i_mutex).
 * @iocb:    I/O control block; the target file is iocb->ki_filp
 * @iov:     source segments
 * @nr_segs: number of entries in @iov
 * @ppos:    in/out file offset
 *
 * Steps, in order: validate the iovec, apply generic_write_checks(),
 * strip suid and update timestamps, then either do a buffered write or,
 * for O_DIRECT files, a direct write with a buffered fallback for whatever
 * the direct write did not cover.
 *
 * Returns the number of bytes written when that is non-zero, otherwise
 * the error code (or 0).
 */
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
   unsigned long nr_segs, loff_t *ppos)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
loff_t pos;
ssize_t written;
ssize_t err;

/* Validate the segments and accumulate their total length in ocount. */
ocount = 0;
err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
if (err)
return err;

count = ocount;
pos = *ppos;

/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
written = 0;

/* May adjust both pos and count. */
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;

/* Nothing left to write after the checks: return err, which is 0 here. */
if (count == 0)
goto out;

err = file_remove_suid(file);
if (err)
goto out;

err = file_update_time(file);
if (err)
goto out;

/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
loff_t endbyte;
ssize_t written_buffered;

written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
ppos, count, ocount);
/* Done on error or when the direct write covered everything. */
if (written < 0 || written == count)
goto out;
/*
 * direct-io write to a hole: fall through to buffered I/O
 * for completing the rest of the request.
 */
pos += written;
count -= written;
written_buffered = generic_file_buffered_write(iocb, iov,
nr_segs, pos, ppos, count,
written);
/*
 * If generic_file_buffered_write() returned a synchronous error
 * then we want to return the number of bytes which were
 * direct-written, or the error code if that was zero.  Note
 * that this differs from normal direct-io semantics, which
 * will return -EFOO even if some bytes were written.
 */
if (written_buffered < 0) {
err = written_buffered;
goto out;
}

/*
 * We need to ensure that the page cache pages are written to
 * disk and invalidated to preserve the expected O_DIRECT
 * semantics.
 */
/*
 * written_buffered is the running total (direct + buffered), so
 * subtracting written leaves the buffered byte count.
 */
endbyte = pos + written_buffered - written - 1;
err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
if (err == 0) {
written = written_buffered;
invalidate_mapping_pages(mapping,
   pos >> PAGE_CACHE_SHIFT,
   endbyte >> PAGE_CACHE_SHIFT);
} else {
/*
 * We don't know how much we wrote, so just return
 * the number of bytes which were direct-written
 */
}
} else {
written = generic_file_buffered_write(iocb, iov, nr_segs,
pos, ppos, count, written);
}
out:
/* Undo the backing_dev_info hint set above on every exit path. */
current->backing_dev_info = NULL;
return written ? written : err;
}
/*
 * generic_file_buffered_write - page-cache write of an iovec.
 * @iocb:    I/O control block; the target file is iocb->ki_filp
 * @iov:     source segments
 * @nr_segs: number of entries in @iov
 * @pos:     file offset to start writing at
 * @ppos:    updated to @pos plus the bytes written, on success only
 * @count:   number of bytes to write
 * @written: bytes of @iov already consumed by an earlier stage
 *
 * Builds an iov_iter that skips the first @written bytes and feeds it to
 * generic_perform_write().
 *
 * Returns the cumulative byte count (@written plus this call) when that
 * is non-zero, otherwise the status from generic_perform_write().
 */
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos, loff_t *ppos,
		size_t count, ssize_t written)
{
	struct iov_iter iter;
	ssize_t status;

	iov_iter_init(&iter, iov, nr_segs, count, written);
	status = generic_perform_write(iocb->ki_filp, &iter, pos);

	if (likely(status >= 0)) {
		written += status;
		*ppos = pos + status;
	}

	if (written)
		return written;
	return status;
}
****************************************************************************************
同sys_read一样，开始调用流程都很明了。
generic_perform_write的pos表示要写的文件内偏移，iov_iter->count保存要写的数据大小。根据这两个参数计算出write涉及到的所有page偏移，对每个page分别处理：
1. a_ops->write_begin，查找或者分配相应的page缓存，有必要的话为这个page的每个block分配buffer_head，有必要的话还要等待从磁盘读取这些数据。基本上write_begin起了一个类似于读取磁盘的作用。
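2. iov_iter_copy_from_user_atomic通过kmap_atomic把page临时映射到一个内核地址，然后把用户buf数据拷贝到这个地址中去，然后释放映射。pagefault disable/enable作用？？？（从下面代码的注释看：这里用的是atomic usercopy，拷贝期间不能处理缺页，所以事先用iov_iter_fault_in_readable把用户页读进来；如果拷贝时仍然缺页，copied会小于bytes甚至为0，copied为0时跳回again按单个segment的长度重试。）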
3. a_ops->write_end根据情况更新buffer_head，page的状态，并且调用mark_inode_dirty通知per-bdi内核线程进行回写。
****************************************************************************************
/*
 * generic_perform_write - copy user data into the page cache, page by page.
 * @file: file being written
 * @i:    iterator over the source data; its remaining count drives the loop
 * @pos:  file offset to start writing at
 *
 * For each page-sized (or smaller) chunk: fault in the user source page,
 * call ->write_begin to obtain the page, copy with page faults disabled,
 * call ->write_end, advance the iterator and throttle via
 * balance_dirty_pages_ratelimited().
 *
 * Returns the total bytes written when non-zero, otherwise the last
 * status (0 or a negative errno).
 */
static ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
struct address_space *mapping = file->f_mapping;
const struct address_space_operations *a_ops = mapping->a_ops;
long status = 0;
ssize_t written = 0;
unsigned int flags = 0;

/*
 * Copies from kernel address space cannot fail (NFSD is a big user).
 */
if (segment_eq(get_fs(), KERNEL_DS))
flags |= AOP_FLAG_UNINTERRUPTIBLE;

do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;

/* Never cross a page boundary in one iteration. */
offset = (pos & (PAGE_CACHE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_count(i));

again:
/*
 * Bring in the user page that we will copy from _first_.
 * Otherwise there's a nasty deadlock on copying from the
 * same page as we're writing to, without it being marked
 * up-to-date.
 *
 * Not only is this an optimisation, but it is also required
 * to check that the address is actually valid, when atomic
 * usercopies are used, below.
 */
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
break;
}

status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
if (unlikely(status))
break;

if (mapping_writably_mapped(mapping))
flush_dcache_page(page);

/* The copy runs with page faults disabled, so it may come up short. */
pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
pagefault_enable();
flush_dcache_page(page);

mark_page_accessed(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status < 0))
break;
/* ->write_end reports how many bytes it actually accepted. */
copied = status;

cond_resched();

iov_iter_advance(i, copied);
if (unlikely(copied == 0)) {
/*
 * If we were unable to copy any data at all, we must
 * fall back to a single segment length write.
 *
 * If we didn't fallback here, we could livelock
 * because not all segments in the iov can be copied at
 * once without a pagefault.
 */
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_single_seg_count(i));
goto again;
}
pos += copied;
written += copied;

balance_dirty_pages_ratelimited(mapping);
/* Give up early on a fatal signal; bytes already written still count. */
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
}
} while (iov_iter_count(i));

return written ? written : status;
}

****************************************************************************************
ext2_write_begin首先调用grab_cache_page_write_begin，查找page，没有则分配一个，但是如果page正在被写回（PG_writeback状态，写请求正在被提交但是没完成），则必须等待上一个写过程完成，要不然数据就乱大了。
然后有必要的话__block_write_begin为page的每个block分配buffer_head，然后对每个buffer_head分别进行处理：
1. 检查该写请求是否涉及到该block，没有涉及的话则继续处理下一个block。
2. 如果该buffer_head还没有映射到磁盘，BH_Mapped没有设置，新分配的buffer_head都是没有映射的，所以这里又要使用ext2_get_block去读取这个映射关系了，同sys_read中的区别是，这里只读取一个块的映射关系。
3. 检查page如果已经uptodate，设置buffer_head uptodate。如果用户先sys_read,再sys_write的话这里page一般是uptodate的。
4. 反之假设用户直接sys_write的话，这里的page就不是uptodate的，通过ll_rw_block提交buffer_head读请求将磁盘数据读取出来。
所有block都处理完毕之后，如果是直接sys_write的情况，可能已经通过ll_rw_block发出了buffer_head读取请求（只有被本次写操作部分覆盖的block才需要读取，所以最多是写范围首尾的两个block），那么现在必须通过wait_on_buffer等待这些读取全部完成才能继续。
****************************************************************************************
/*
 * ext2 ->write_begin: delegate to block_write_begin() with ext2_get_block
 * as the block mapper. On a negative result, ext2_write_failed() is called
 * with the end offset of the attempted write.
 */
static int ext2_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, void **fsdata)
{
	int err = block_write_begin(mapping, pos, len, flags, pagep,
				    ext2_get_block);

	if (err < 0)
		ext2_write_failed(mapping, pos + len);
	return err;
}
/*
 * block_write_begin - get the locked page-cache page for a write and
 * prepare its buffers.
 * @mapping:   address space being written
 * @pos:       file offset of the write
 * @len:       length of the write within the page
 * @flags:     AOP_FLAG_* flags passed through to page lookup
 * @pagep:     receives the page, or NULL when buffer preparation failed
 * @get_block: filesystem callback used to map blocks
 *
 * Returns 0 on success, -ENOMEM when no page could be obtained (*@pagep is
 * left untouched in that case), or the error from __block_write_begin().
 */
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
		unsigned flags, struct page **pagep, get_block_t *get_block)
{
	struct page *page;
	int status;

	page = grab_cache_page_write_begin(mapping, pos >> PAGE_CACHE_SHIFT,
					   flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, get_block);
	if (likely(!status)) {
		*pagep = page;
		return status;
	}

	/* Preparation failed: unlock and drop the page. */
	unlock_page(page);
	page_cache_release(page);
	*pagep = NULL;
	return status;
}
/*
 * grab_cache_page_write_begin - find or create the locked page at @index.
 * @mapping: address space to search
 * @index:   page index within @mapping
 * @flags:   AOP_FLAG_NOFS masks __GFP_FS out of the allocations
 *
 * Looks the page up with find_lock_page(); when absent, allocates one and
 * inserts it into the page cache, retrying the lookup if the insertion
 * fails with -EEXIST. Before returning, waits for any writeback on the
 * page to finish.
 *
 * Returns the page, or NULL on allocation or insertion failure.
 */
struct page *grab_cache_page_write_begin(struct address_space *mapping,
		pgoff_t index, unsigned flags)
{
	gfp_t gfp_notmask = (flags & AOP_FLAG_NOFS) ? __GFP_FS : 0;
	gfp_t gfp_mask = mapping_gfp_mask(mapping);
	struct page *page;
	int status;

	if (mapping_cap_account_dirty(mapping))
		gfp_mask |= __GFP_WRITE;

	for (;;) {
		page = find_lock_page(mapping, index);
		if (page)
			break;

		page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
		if (!page)
			return NULL;

		status = add_to_page_cache_lru(page, mapping, index,
					       GFP_KERNEL & ~gfp_notmask);
		if (likely(!status))
			break;

		page_cache_release(page);
		/* A page appeared at this index meanwhile: look it up again. */
		if (status != -EEXIST)
			return NULL;
	}

	wait_on_page_writeback(page);
	return page;
}
/*
 * __block_write_begin - prepare the buffers of a locked page for a write.
 * @page:      locked page-cache page (BUG if not locked)
 * @pos:       file offset of the write
 * @len:       length of the write; [from, to) must lie within the page
 * @get_block: filesystem callback used to map unmapped buffers
 *
 * Walks every buffer_head of the page. Buffers outside [from, to) are only
 * marked uptodate when the page already is. Buffers inside the range are
 * mapped if needed, newly allocated blocks are zeroed around the written
 * region, and buffers that are only partially covered and not uptodate
 * are read from disk.
 *
 * Returns 0 on success, the get_block() error, or -EIO when a read failed.
 */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
unsigned to = from + len;
struct inode *inode = page->mapping->host;
unsigned block_start, block_end;
sector_t block;
int err = 0;
unsigned blocksize, bbits;
/*
 * NOTE(review): reads are only issued for partially covered buffers,
 * which can only be the first and last buffer in the range; that is
 * presumably why wait[] has two slots.
 */
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

BUG_ON(!PageLocked(page));
BUG_ON(from > PAGE_CACHE_SIZE);
BUG_ON(to > PAGE_CACHE_SIZE);
BUG_ON(from > to);

blocksize = 1 << inode->i_blkbits;
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);

/* Logical block number of the first buffer in this page. */
bbits = inode->i_blkbits;
block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

/*
 * The buffers form a ring; "!block_start" lets the first iteration run
 * even though bh == head at that point.
 */
for(bh = head, block_start = 0; bh != head || !block_start;
      block++, block_start=block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
/* Buffer entirely outside the write range. */
if (block_end <= from || block_start >= to) {
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
}
continue;
}
if (buffer_new(bh))
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
/* Last argument 1: ask get_block to create the block if missing. */
err = get_block(inode, block, bh, 1);
if (err)
break;
if (buffer_new(bh)) {
unmap_underlying_metadata(bh->b_bdev,
bh->b_blocknr);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
continue;
}
/* Zero the parts of a new block that the write will not fill. */
if (block_end > to || block_start < from)
zero_user_segments(page,
to, block_end,
block_start, from);
continue;
}
}
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
continue;
}
/* Partially overwritten, stale buffer: read it in first. */
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
      !buffer_unwritten(bh) &&
      (block_start < from || block_end > to)) {
ll_rw_block(READ, 1, &bh);
*wait_bh++=bh;
}
}
/*
 * If we issued read requests - let them complete.
 */
while(wait_bh > wait) {
wait_on_buffer(*--wait_bh);
if (!buffer_uptodate(*wait_bh))
err = -EIO;
}
if (unlikely(err))
page_zero_new_buffers(page, from, to);
return err;
}

评分

参与人数 1	可用积分 +6	收起理由
瀚海书香	+ 6	赞一个!

查看全部评分

文库|博客

blake326

丰衣足食

论坛徽章:: 0

2楼 [报告]

发表于 2012-10-09 18:27 |只看该作者

本帖最后由 blake326 于 2012-10-09 18:52 编辑

****************************************************************************************
block_write_end中，首先flush_dcache_page体系相关的刷新数据cache（一般risc需要，比较复杂，表示不理解？）
然后调用__block_commit_write对page每个buffer_head进行处理，对本次写操作涉及到的buffer_head都mark_buffer_dirty设置dirty位。并且如果所有buffer_head都是最新的，刚写过的肯定是最新的，没写的也是最新的话，设置page的uptodate位。
最后解锁page，返回写的字节数。
****************************************************************************************
/*
 * ext2 ->write_end: delegate to generic_write_end(). When fewer than @len
 * bytes were committed, ext2_write_failed() is called with the end offset
 * of the attempted write.
 */
static int ext2_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied,
		struct page *page, void *fsdata)
{
	int committed = generic_write_end(file, mapping, pos, len, copied,
					  page, fsdata);

	if (committed < len)
		ext2_write_failed(mapping, pos + len);
	return committed;
}
/*
 * generic_write_end - commit a page write and extend i_size if needed.
 *
 * Calls block_write_end() to commit the copied bytes, grows inode->i_size
 * when the write ended beyond it, then unlocks and releases the page.
 * The inode is marked dirty only after the page lock has been dropped.
 *
 * Returns the number of bytes committed by block_write_end().
 */
int generic_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied,
		struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	bool size_grew = false;

	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	/*
	 * i_size cannot change under us because i_mutex is held, so a plain
	 * read is fine. The update must happen while the page is still
	 * locked: page writeout could otherwise come in and zero beyond
	 * i_size.
	 */
	if (pos + copied > inode->i_size) {
		i_size_write(inode, pos + copied);
		size_grew = true;
	}

	unlock_page(page);
	page_cache_release(page);

	/*
	 * Dirtying the inode under the page lock would lengthen the lock
	 * hold time and force a page-lock -> transaction-start ordering on
	 * journaling filesystems, so do it only now.
	 */
	if (size_grew)
		mark_inode_dirty(inode);

	return copied;
}

/*
 * block_write_end - commit the bytes copied into a page's buffers.
 *
 * Returns the number of bytes actually committed, which is 0 when a short
 * copy hit a page that is not uptodate.
 */
int block_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied,
		struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);

	if (unlikely(copied < len)) {
		/*
		 * Fully written buffers become uptodate, but a buffer that
		 * was only partly filled by a short copy does not, and a
		 * later readpage could clobber the partial data. Simplest
		 * fix: treat a short write to a non-uptodate page as a
		 * zero-length write and make the caller redo it.
		 */
		if (!PageUptodate(page))
			copied = 0;

		page_zero_new_buffers(page, offset + copied, offset + len);
	}
	flush_dcache_page(page);

	/* May be a short, even zero-length, commit. */
	__block_commit_write(inode, page, offset, offset + copied);

	return copied;
}
static int __block_commit_write(struct inode *inode, struct page *page,
unsigned from, unsigned to)
{
unsigned block_start, block_end;
int partial = 0;
unsigned blocksize;
struct buffer_head *bh, *head;

blocksize = 1 << inode->i_blkbits;

for(bh = head = page_buffers(page), block_start = 0;
      bh != head || !block_start;
      block_start=block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (!buffer_uptodate(bh))
partial = 1;
} else {
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
}
clear_buffer_new(bh);
}

/*
   * If this is a partial write which happened to make all buffers
   * uptodate then we can optimize away a bogus readpage() for
   * the next read(). Here we 'discover' whether the page went
   * uptodate as a result of this (potentially partial) write.
   */
if (!partial)
SetPageUptodate(page);
return 0;
}

****************************************************************************************
test_set_buffer_dirty设置dirty位，返回旧的dirty位。所以正常情况，首次对一个page写的话将会执行到__set_page_dirty设置page的dirty。当然__set_page_dirty不仅仅设置一个dirty位，还调用到__mark_inode_dirty唤醒writeback线程。
__mark_inode_dirty检查如果inode不是dirty的话，将它设置为dirty，然后看inode关联的bdi的dirty list（bdi->wb.b_dirty）是否有inode要处理，然后将该inode move到bdi dirty list中，这个时候看看是否要延迟唤醒（默认5s）bdi writeback线程进行真正的写磁盘动作。
****************************************************************************************
/*
 * mark_buffer_dirty - set the dirty bit on a buffer and, when this is the
 * transition from clean, on its page as well.
 *
 * Warns once if the buffer is not uptodate. When the page was clean too
 * and still belongs to a mapping, __set_page_dirty() is called for it.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
	struct address_space *mapping;
	struct page *page;

	WARN_ON_ONCE(!buffer_uptodate(bh));

	/*
	 * Carefully optimised already-dirty fast path: the barrier keeps the
	 * final "is it dirty" check from being satisfied before a possible
	 * earlier modification of the buffer.
	 */
	if (buffer_dirty(bh)) {
		smp_mb();
		if (buffer_dirty(bh))
			return;
	}

	/* Someone else set the bit first: nothing more to do. */
	if (test_set_buffer_dirty(bh))
		return;

	page = bh->b_page;
	if (TestSetPageDirty(page))
		return;

	mapping = page_mapping(page);
	if (mapping)
		__set_page_dirty(page, mapping, 0);
}
/*
 * __set_page_dirty - account a newly dirtied page and tag it in the
 * mapping's radix tree, then mark the owning inode as having dirty pages.
 * @page:    page whose dirty flag was just set by the caller
 * @mapping: address space the page belongs to
 * @warn:    when non-zero, warn once if the page is not uptodate
 */
static void __set_page_dirty(struct page *page,
		struct address_space *mapping, int warn)
{
	spin_lock_irq(&mapping->tree_lock);
	/* page->mapping may have been cleared by a racing truncate. */
	if (page->mapping) {
		WARN_ON_ONCE(warn && !PageUptodate(page));
		account_page_dirtied(page, mapping);
		radix_tree_tag_set(&mapping->page_tree, page_index(page),
				   PAGECACHE_TAG_DIRTY);
	}
	spin_unlock_irq(&mapping->tree_lock);

	/* Done outside tree_lock. */
	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
/*
 * __mark_inode_dirty - record dirty state on an inode.
 * @inode: inode to mark
 * @flags: I_DIRTY_* bits to set in inode->i_state
 *
 * Gives the filesystem a ->dirty_inode callback for I_DIRTY_SYNC and
 * I_DIRTY_DATASYNC, ORs @flags into i_state, and if the inode had no
 * I_DIRTY bits before, timestamps it and moves it onto its bdi's b_dirty
 * list. When that bdi had no dirty IO at all, its thread is woken via
 * bdi_wakeup_thread_delayed().
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
struct backing_dev_info *bdi = NULL;

/*
 * Don't do this for I_DIRTY_PAGES - that doesn't actually
 * dirty the inode itself
 */
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
if (sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode, flags);
}

/*
 * make sure that changes are seen by all cpus before we test i_state
 * -- mikulas
 */
smp_mb();

/* avoid the locking if we can */
if ((inode->i_state & flags) == flags)
return;

if (unlikely(block_dump))
block_dump___mark_inode_dirty(inode);

spin_lock(&inode->i_lock);
/* Re-check under i_lock; the unlocked test above was only a fast path. */
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;

inode->i_state |= flags;

/*
 * If the inode is being synced, just update its dirty state.
 * The unlocker will place the inode on the appropriate
 * superblock list, based upon its state.
 */
if (inode->i_state & I_SYNC)
goto out_unlock_inode;

/*
 * Only add valid (hashed) inodes to the superblock's
 * dirty list.  Add blockdev inodes as well.
 */
if (!S_ISBLK(inode->i_mode)) {
if (inode_unhashed(inode))
goto out_unlock_inode;
}
if (inode->i_state & I_FREEING)
goto out_unlock_inode;

/*
 * If the inode was already on b_dirty/b_io/b_more_io, don't
 * reposition it (that would break b_dirty time-ordering).
 */
if (!was_dirty) {
bool wakeup_bdi = false;
bdi = inode_to_bdi(inode);

if (bdi_cap_writeback_dirty(bdi)) {
WARN(!test_bit(BDI_registered, &bdi->state),
      "bdi-%s not registered\n", bdi->name);

/*
 * If this is the first dirty inode for this
 * bdi, we have to wake-up the corresponding
 * bdi thread to make sure background
 * write-back happens later.
 */
if (!wb_has_dirty_io(&bdi->wb))
wakeup_bdi = true;
}

/*
 * i_lock is dropped here and list_lock taken instead; this path
 * returns directly and must not fall through to out_unlock_inode.
 */
spin_unlock(&inode->i_lock);
spin_lock(&bdi->wb.list_lock);
inode->dirtied_when = jiffies;
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
spin_unlock(&bdi->wb.list_lock);

if (wakeup_bdi)
bdi_wakeup_thread_delayed(bdi);
return;
}
}
out_unlock_inode:
spin_unlock(&inode->i_lock);

}

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

blake326

丰衣足食

论坛徽章:: 0

3楼 [报告]

发表于 2012-10-09 18:27 |只看该作者

****************************************************************************************
sys_write仅仅是写page，并没有提交写请求，具体的写工作是一个叫做bdi writeback线程来负责的。在代码里面是由每个磁盘相关的request_queue.backing_dev_info来管理的。backing_dev_info维护了一些磁盘辅助信息，最重要的一个属性就是bdi_writeback结构：
bdi_writeback的task指向了writeback线程，如果存在的话。
wakeup_timer定时器，默认的回调方法就是唤醒default_backing_dev_info.wb.task或者当前bdi.wb.task
b_dirty链表保存了该磁盘上脏的inode。
在add_disk中，通过bdi_register_dev(&disk->queue->backing_dev_info, disk_devt(disk))注册bdi。就是说每个gendisk就有一个bdi。
****************************************************************************************
/*
 * Per-device writeback and readahead state. Besides device capabilities
 * and bandwidth/dirty-rate estimates it embeds the bdi_writeback (wb) that
 * tracks dirty inodes, and the work_list of queued writeback work items.
 */
struct backing_dev_info {
struct list_head bdi_list; /* link in the global bdi_list (see bdi_register) */
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long state; /* Always use atomic bitops on this */
unsigned int capabilities; /* Device capabilities */
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */

char *name;

struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];

unsigned long bw_time_stamp; /* last time write bw is updated */
unsigned long dirtied_stamp;
unsigned long written_stamp; /* pages written at bw_time_stamp */
unsigned long write_bandwidth; /* the estimated write bandwidth */
unsigned long avg_write_bandwidth; /* further smoothed write bw */

/*
 * The base dirty throttle rate, re-calculated on every 200ms.
 * All the bdi tasks' dirty rate will be curbed under it.
 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
 * in small steps and is much more smooth/stable than the latter.
 */
unsigned long dirty_ratelimit;
unsigned long balanced_dirty_ratelimit;

struct fprop_local_percpu completions;
int dirty_exceeded;

unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;

struct bdi_writeback wb;  /* default writeback info for this bdi */
spinlock_t wb_lock;    /* protects work_list */

struct list_head work_list; /* queued writeback work items */

struct device *dev; /* set by bdi_register(); non-NULL once registered */

struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS
struct dentry *debug_dir;
struct dentry *debug_stats;
#endif
};
/*
 * Writeback state embedded in each backing_dev_info: the flusher thread
 * (if one currently exists), its delayed wake-up timer, and the three
 * inode lists that writeback works through.
 */
struct bdi_writeback {
struct backing_dev_info *bdi; /* our parent bdi */
unsigned int nr;

unsigned long last_old_flush; /* last old data flush */
unsigned long last_active; /* last time bdi thread was active */

struct task_struct *task; /* writeback thread, NULL when none exists */
struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
struct list_head b_dirty; /* dirty inodes */
struct list_head b_io; /* parked for writeback */
struct list_head b_more_io; /* parked for more writeback */
spinlock_t list_lock; /* protects the b_* lists */
};
/*
 * Timer callback for bdi_writeback.wakeup_timer.
 * @data: the backing_dev_info, cast to unsigned long
 *
 * Wakes the bdi's own writeback thread when it exists. Otherwise, if the
 * bdi is registered (has a device), wakes the default bdi's forker thread
 * so that it can create one.
 */
static void wakeup_timer_fn(unsigned long data)
{
	struct backing_dev_info *bdi = (struct backing_dev_info *)data;

	spin_lock_bh(&bdi->wb_lock);
	if (bdi->wb.task) {
		trace_writeback_wake_thread(bdi);
		wake_up_process(bdi->wb.task);
		goto unlock;
	}
	if (bdi->dev) {
		/*
		 * bdi threads that stay inactive for a long time are killed;
		 * in that case the forker thread has to be woken so that it
		 * creates and runs the bdi thread again.
		 */
		trace_writeback_wake_forker_thread(bdi);
		wake_up_process(default_backing_dev_info.wb.task);
	}
unlock:
	spin_unlock_bh(&bdi->wb_lock);
}

****************************************************************************************
除了每个gendisk都会通过bdi_register注册一个bdi之外，内核还注册了一个默认的default_backing_dev_info。这个bdi创建了一个bdi-default线程，由它来管理其他普通bdi的writeback线程。
****************************************************************************************
/*
 * The default bdi. It is registered under the name "default" by
 * default_bdi_init(), and its wb.task is the forker thread that
 * wakeup_timer_fn() wakes when another bdi has no thread of its own.
 */
struct backing_dev_info default_backing_dev_info = {
.name = "default",
.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, /* VM_MAX_READAHEAD is presumably in KiB - TODO confirm */
.state = 0,
.capabilities = BDI_CAP_MAP_COPY,
};
/*
 * Boot-time initialisation of the two built-in bdis.
 *
 * Initialises default_backing_dev_info and, on success, registers it under
 * the name "default". noop_backing_dev_info is then initialised; its
 * result is what gets returned.
 */
static int __init default_bdi_init(void)
{
	int err;

	err = bdi_init(&default_backing_dev_info);
	if (!err)
		bdi_register(&default_backing_dev_info, NULL, "default");
	err = bdi_init(&noop_backing_dev_info);

	return err;
}
subsys_initcall(default_bdi_init);
/*
 * bdi_register - create the device for a bdi and add it to the global list.
 * @bdi:    backing_dev_info to register
 * @parent: parent device, may be NULL
 * @fmt:    printf-style format for the device name, followed by its args
 *
 * A bdi that already has a device is left alone. For a bdi with the
 * flush-forker capability the forker thread is started right away; the
 * bdi is then flagged BDI_registered and appended to bdi_list.
 *
 * Returns 0 on success (or if already registered), otherwise the error
 * from device or thread creation.
 */
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
		const char *fmt, ...)
{
	struct device *dev;
	va_list args;

	/* The driver needs to use separate queues per device */
	if (bdi->dev)
		return 0;

	va_start(args, fmt);
	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
	va_end(args);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	bdi->dev = dev;

	/*
	 * Only the default backing_dev_info gets its (forker) thread started
	 * here. Other bdis are just added to the list and get a thread
	 * created on demand.
	 */
	if (bdi_cap_flush_forker(bdi)) {
		struct bdi_writeback *forker_wb = &bdi->wb;

		forker_wb->task = kthread_run(bdi_forker_thread, forker_wb,
					      "bdi-%s", dev_name(dev));
		if (IS_ERR(forker_wb->task))
			return PTR_ERR(forker_wb->task);
	}

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(BDI_registered, &bdi->state);

	spin_lock_bh(&bdi_lock);
	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}

****************************************************************************************
default bdi线程bdi_forker_thread实际上是一个死循环，一般是由wakeup_timer定时器唤醒。具体处理如下：
1. 检查default bdi的wb是否有io请求，或者是否bdi work。有的话则删除wakeup_timer，调用wb_do_writeback进行回写。
2. 通过bdi_list链表遍历所有注册的bdi，对每个bdi进行检查。
如果bdi没有写回的特点比如内存bdi则继续下一个bdi。
然后同样检查bdi的wb是否有请求或者有bdi work，如果有的话，并且bdi->wb.task=null,则设置bdi的action=FORK_THREAD。然后break。
如果bdi没有工作要处理，并且该bdi->wb.task已经空闲超过bdi_longest_inactive()的时间，则设置bdi的action=KILL_THREAD。然后break。
3. 现在假设发现了要启动一个bdi，kthread_create(bdi_writeback_thread, &bdi->wb,"flush-%s", dev_name(bdi->dev))创建一个flush-8:0之类的writeback线程。
4. 假设没有fork，也没有kill的bdi。则default bdi线程睡眠5s钟，或许醒来之后会发现哪个bdi长期不活动可以kill掉了。
****************************************************************************************
static int bdi_forker_thread(void *ptr)
{
struct bdi_writeback *me = ptr;

current->flags |= PF_SWAPWRITE;
set_freezable();

/*
   * Our parent may run at a different priority, just set us to normal
   */
set_user_nice(current, 0);

for (;

{
struct task_struct *task = NULL;
struct backing_dev_info *bdi;
enum {
NO_ACTION, /* Nothing to do */
FORK_THREAD, /* Fork bdi thread */
KILL_THREAD, /* Kill inactive bdi thread */
} action = NO_ACTION;

/*
   * Temporary measure, we want to make sure we don't see
   * dirty data on the default backing_dev_info
   */
if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
del_timer(&me->wakeup_timer);
wb_do_writeback(me, 0);
}

spin_lock_bh(&bdi_lock);
/*
   * In the following loop we are going to check whether we have
   * some work to do without any synchronization with tasks
   * waking us up to do work for them. Set the task state here
   * so that we don't miss wakeups after verifying conditions.
   */
set_current_state(TASK_INTERRUPTIBLE);

list_for_each_entry(bdi, &bdi_list, bdi_list) {
bool have_dirty_io;

if (!bdi_cap_writeback_dirty(bdi) ||
      bdi_cap_flush_forker(bdi))
continue;

WARN(!test_bit(BDI_registered, &bdi->state),
      "bdi %p/%s is not registered!\n", bdi, bdi->name);

have_dirty_io = !list_empty(&bdi->work_list) ||
wb_has_dirty_io(&bdi->wb);

/*
   * If the bdi has work to do, but the thread does not
   * exist - create it.
   */
if (!bdi->wb.task && have_dirty_io) {
/*
   * Set the pending bit - if someone will try to
   * unregister this bdi - it'll wait on this bit.
   */
set_bit(BDI_pending, &bdi->state);
action = FORK_THREAD;
break;
}

spin_lock(&bdi->wb_lock);

/*
   * If there is no work to do and the bdi thread was
   * inactive long enough - kill it. The wb_lock is taken
   * to make sure no-one adds more work to this bdi and
   * wakes the bdi thread up.
   */
if (bdi->wb.task && !have_dirty_io &&
      time_after(jiffies, bdi->wb.last_active +
bdi_longest_inactive())) {
task = bdi->wb.task;
bdi->wb.task = NULL;
spin_unlock(&bdi->wb_lock);
set_bit(BDI_pending, &bdi->state);
action = KILL_THREAD;
break;
}
spin_unlock(&bdi->wb_lock);
}
spin_unlock_bh(&bdi_lock);

/* Keep working if default bdi still has things to do */
if (!list_empty(&me->bdi->work_list))
__set_current_state(TASK_RUNNING);

switch (action) {
case FORK_THREAD:
__set_current_state(TASK_RUNNING);
task = kthread_create(bdi_writeback_thread, &bdi->wb,
      "flush-%s", dev_name(bdi->dev));
if (IS_ERR(task)) {
/*
   * If thread creation fails, force writeout of
   * the bdi from the thread. Hopefully 1024 is
   * large enough for efficient IO.
   */
writeback_inodes_wb(&bdi->wb, 1024,
      WB_REASON_FORKER_THREAD);
} else {
/*
   * The spinlock makes sure we do not lose
   * wake-ups when racing with 'bdi_queue_work()'.
   * And as soon as the bdi thread is visible, we
   * can start it.
   */
spin_lock_bh(&bdi->wb_lock);
bdi->wb.task = task;
spin_unlock_bh(&bdi->wb_lock);
wake_up_process(task);
}
bdi_clear_pending(bdi);
break;

case KILL_THREAD:
__set_current_state(TASK_RUNNING);
kthread_stop(task);
bdi_clear_pending(bdi);
break;

case NO_ACTION:
if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
/*
   * There are no dirty data. The only thing we
   * should now care about is checking for
   * inactive bdi threads and killing them. Thus,
   * let's sleep for longer time, save energy and
   * be friendly for battery-driven devices.
   */
schedule_timeout(bdi_longest_inactive());
else
schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
try_to_freeze();
break;
}
}

return 0;
}

****************************************************************************************
继续sys_write，执行block_write_end之后通过__mark_inode_dirty有需要的话唤醒default bdi wb或者bdi.wb.task线程。条条大路通罗马，bdi_writeback_thread最终会被执行。
bdi_writeback_thread本质上就是一个循环，调用wb_do_writeback进行回写。
回写之后发现仍然有工作要做，超时5s调度出去。否则直接调度出去，等待被sys_write之类唤醒。

很明显内核磁盘回写的策略是：
如果持续的对磁盘有写请求的话，bdi wb task将会一直存在，但是每隔5s集中处理一次回写工作。
如果只是偶尔的对磁盘有写请求的话，一般一个写请求之后，都会5s超时唤醒default bdi线程，然后default bdi线程创建并运行一个磁盘的flush线程，在这个flush线程中处理回写，过了5s左右之后这个bdi wb task会被bdi-default kill掉。然后偶然又有写请求产生。
但是wb_do_writeback回写并不是一定会提交写请求给磁盘，一般两种情况满足下才会提交一部分page写请求给磁盘，分别是，page dirty超时30s，另外一个是dirty page比例太高。
****************************************************************************************
int bdi_writeback_thread(void *data)
{
while (!kthread_freezable_should_stop(NULL)) {

del_timer(&wb->wakeup_timer);

pages_written = wb_do_writeback(wb, 0);

trace_writeback_pages_written(pages_written);

if (pages_written)
wb->last_active = jiffies;

set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
continue;
}

if (wb_has_dirty_io(wb) && dirty_writeback_interval)
schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
else {
/*
   * We have nothing to do, so can go sleep without any
   * timeout and save power. When a work is queued or
   * something is made dirty - we will be woken up.
   */
schedule();
}
}

/* Flush any work that raced with us exiting */
if (!list_empty(&bdi->work_list))
wb_do_writeback(wb, 1);

trace_writeback_thread_stop(bdi);
return 0;
}

****************************************************************************************
wb_do_writeback是具体处理写回工作的函数。通过get_next_work_item，从bdi->work_list任务队列上取出所有wb_writeback_work类型的任务，并且分别调用wb_writeback进行处理。就是说，具体的工作处理都是对wb_writeback_work的处理。但是，vfs_write中我们只不过将dirty的inode move到了bdi->wb.b_dirty队列。那么wb_writeback_work任务怎么来的呢？
wb_check_old_data_flush() 产生一个for_kupdate的任务通过wb_writeback处理。
wb_check_background_flush()产生一个for_background的任务通过wb_writeback处理。
所以具体逻辑都是在wb_writeback里面处理的，for_kupdate会写回dirty超过30s的page。for_background会检测是否脏页比例过高或者可用页太少，然后写回一部分page。
****************************************************************************************
/*
 * wb_do_writeback - run all queued work items, then the periodic checks.
 * @wb:         writeback context to service
 * @force_wait: when non-zero, every work item is switched to WB_SYNC_ALL
 *
 * Drains the bdi's work list through wb_writeback(), then runs the
 * old-data and background flush checks. BDI_writeback_running is set in
 * the bdi state for the duration.
 *
 * Returns the total reported by wb_writeback() and the two checks.
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct wb_writeback_work *work;
	long wrote = 0;

	set_bit(BDI_writeback_running, &wb->bdi->state);

	for (work = get_next_work_item(bdi); work != NULL;
	     work = get_next_work_item(bdi)) {
		/*
		 * This thread is exiting: override the sync mode so that we
		 * wait for completion.
		 */
		if (force_wait)
			work->sync_mode = WB_SYNC_ALL;

		trace_writeback_exec(bdi, work);

		wrote += wb_writeback(wb, work);

		/*
		 * A synchronous item has a waiter to notify; anything else
		 * is simply freed.
		 */
		if (work->done)
			complete(work->done);
		else
			kfree(work);
	}

	/* Periodic, kupdated()-style writeback checks. */
	wrote += wb_check_old_data_flush(wb);
	wrote += wb_check_background_flush(wb);

	clear_bit(BDI_writeback_running, &wb->bdi->state);

	return wrote;
}

****************************************************************************************
对于for_background任务，通过over_bground_thresh检查dirty过多的话才真正处理work。work->older_than_this为当前时间。
对于for_kupdate任务，设置work->older_than_this为当前时间往前30s。queue_io会用到。
如果需要继续处理任务，调用queue_io准备b_io处理队列。queue_io会将b_more_io拼接到b_io上，并且看是否有过期的inode要move到b_io上（只有for_kupdate的任务才有可能）
然后调用__writeback_inodes_wb处理b_io上所有的inode。
__writeback_inodes_wb实现比较绕人，实际上就是对每个inode分别调用了writeback_sb_inodes，从而调用了__writeback_single_inode写回单个inode。
****************************************************************************************
static long wb_writeback(struct bdi_writeback *wb,
   struct wb_writeback_work *work)
{
unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
unsigned long oldest_jif;
struct inode *inode;
long progress;

oldest_jif = jiffies;
work->older_than_this = &oldest_jif;

spin_lock(&wb->list_lock);
for (;

{
/*
   * Stop writeback when nr_pages has been consumed
   */
if (work->nr_pages <= 0)
break;

/*
   * Background writeout and kupdate-style writeback may
   * run forever. Stop them if there is other work to do
   * so that e.g. sync can proceed. They'll be restarted
   * after the other works are all done.
   */
if ((work->for_background || work->for_kupdate) &&
      !list_empty(&wb->bdi->work_list))
break;

/*
   * For background writeout, stop when we are below the
   * background dirty threshold
   */
if (work->for_background && !over_bground_thresh(wb->bdi))
break;

/*
   * Kupdate and background works are special and we want to
   * include all inodes that need writing. Livelock avoidance is
   * handled by these works yielding to any other work so we are
   * safe.
   */
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background)
oldest_jif = jiffies;

trace_writeback_start(wb->bdi, work);
if (list_empty(&wb->b_io))
queue_io(wb, work);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb->bdi, work);

wb_update_bandwidth(wb, wb_start);

/*
   * Did we write something? Try for more
   *
   * Dirty inodes are moved to b_io for writeback in batches.
   * The completion of the current batch does not necessarily
   * mean the overall work is done. So we keep looping as long
   * as made some progress on cleaning pages or inodes.
   */
if (progress)
continue;
/*
   * No more inodes for IO, bail
   */
if (list_empty(&wb->b_more_io))
break;
/*
   * Nothing written. Wait for some inode to
   * become available for writeback. Otherwise
   * we'll just busyloop.
   */
if (!list_empty(&wb->b_more_io))  {
trace_writeback_wait(wb->bdi, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
/* This function drops i_lock... */
inode_sleep_on_writeback(inode);
spin_lock(&wb->list_lock);
}
}
spin_unlock(&wb->list_lock);

return nr_pages - work->nr_pages;
}

/*
 * Build the next IO batch on b_io, eldest inode first.
 *
 * Everything parked on b_more_io is spliced onto b_io, after which
 * move_expired_inodes() transfers inodes from b_dirty according to @work.
 *
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 *
 * The caller must hold wb->list_lock.
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
	int nr_moved;

	assert_spin_locked(&wb->list_lock);

	list_splice_init(&wb->b_more_io, &wb->b_io);
	nr_moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);

	trace_writeback_queue_io(wb, work, nr_moved);
}

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

blake326

丰衣足食

论坛徽章:: 0

4楼 [报告]

发表于 2012-10-09 18:28 |只看该作者

****************************************************************************************
首先调用do_writepages，启用plug写入address_space的所有page。
有必要的话写回inode。
我们只关心do_writepages
****************************************************************************************
/*
 * Write back one inode: first its dirty pages, then, if the metadata was
 * dirty, the inode itself.
 *
 * @inode: inode to write; expected to have I_SYNC set (a warning is
 *         emitted otherwise)
 * @wb:    writeback context (not referenced in this function body)
 * @wbc:   writeback control; sync_mode decides whether the data is
 *         waited for before the metadata is written
 *
 * Returns the first non-zero status seen among do_writepages(),
 * filemap_fdatawait() and write_inode(), or 0 if all succeeded.
 */
static int
__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
			 struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	long initial_budget = wbc->nr_to_write;
	unsigned dirty_bits;
	int status;
	int rc;

	WARN_ON(!(inode->i_state & I_SYNC));

	status = do_writepages(mapping, wbc);

	/*
	 * In WB_SYNC_ALL mode the data must be on its way out before the
	 * metadata is written; this matters for filesystems that modify
	 * metadata on data I/O completion.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL) {
		rc = filemap_fdatawait(mapping);
		if (!status)
			status = rc;
	}

	/*
	 * Some filesystems may redirty the inode during the writeback
	 * due to delalloc, so the dirty metadata flags are sampled and
	 * cleared right before write_inode().
	 */
	spin_lock(&inode->i_lock);
	/* I_DIRTY_PAGES goes away only if no dirty-tagged page remains. */
	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		inode->i_state &= ~I_DIRTY_PAGES;
	dirty_bits = inode->i_state & I_DIRTY;
	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
	spin_unlock(&inode->i_lock);

	/* Skip write_inode() when only I_DIRTY_PAGES was set. */
	if (dirty_bits & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		rc = write_inode(inode, wbc);
		if (!status)
			status = rc;
	}

	trace_writeback_single_inode(inode, wbc, initial_budget);
	return status;
}

****************************************************************************************
write_cache_pages实际上对每个page分别调用了mapping->a_ops->writepage。例如ext2_writepage。最终调用到block_write_full_page_endio。
block_write_full_page_endio方法中会首先设置page的PG_writeback，然后对每个buffer_head分别submit_bh。

每个页的每个block都是通过submit_bh提交的，所以每个block对应一个bio（当然同read一样，这些bio可以合并到一个request中）。因此当写请求完成之后，每个bio都会调用end_buffer_async_write回调函数，然后处理buffer_head和page两个层次的状态：
unlock_buffer，这样如果在bh写请求过程中，有读取bh的请求，那么这个时候可以唤醒它。
如果该page上其它buffer_head都已经不处于async_write状态（即该page所有buffer_head的写请求都已完成），则调用end_page_writeback清除PG_writeback，唤醒等待在PG_writeback上的进程。
****************************************************************************************
/*
 * Write out dirty pages of @mapping as directed by @wbc.
 *
 * Nothing is done when the wbc->nr_to_write budget is already used up.
 * Otherwise the address space's own ->writepages method is used when it
 * exists, with generic_writepages() as the fallback.
 *
 * Returns 0 when the budget is exhausted, else whatever the chosen
 * writepages implementation returns.
 */
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	if (wbc->nr_to_write <= 0)
		return 0;

	if (!mapping->a_ops->writepages)
		return generic_writepages(mapping, wbc);

	return mapping->a_ops->writepages(mapping, wbc);
}
/*
 * Fallback ->writepages implementation: run write_cache_pages() over
 * @mapping with __writepage as the per-page callback, bracketed by a
 * block-layer plug.
 *
 * Returns 0 without doing anything when the address space has no
 * ->writepage method (chardevs and other special files); otherwise
 * returns the result of write_cache_pages().
 */
int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	struct blk_plug plug;
	int status = 0;

	if (mapping->a_ops->writepage) {
		blk_start_plug(&plug);
		status = write_cache_pages(mapping, wbc, __writepage, mapping);
		blk_finish_plug(&plug);
	}

	return status;
}
/*
 * ext2's single-page writeback entry: delegates to
 * block_write_full_page(), passing ext2_get_block as the get_block
 * callback. Returns whatever block_write_full_page() returns.
 */
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, ext2_get_block, wbc);
}
/*
 * Convenience wrapper around block_write_full_page_endio() that always
 * passes end_buffer_async_write as the last argument (presumably the
 * per-buffer I/O completion handler -- confirm against
 * block_write_full_page_endio()). Returns that function's result.
 */
int block_write_full_page(struct page *page, get_block_t *get_block,
struct writeback_control *wbc)
{
return block_write_full_page_endio(page, get_block, wbc,
   end_buffer_async_write);
}

/*
 * Completion handler for one buffer_head submitted as an async write.
 *
 * @bh:       the buffer whose write finished; must have its async_write
 *            flag set (BUG otherwise)
 * @uptodate: non-zero when the write succeeded
 *
 * Records success or failure on the buffer and its page, clears the
 * buffer's async_write flag and unlocks it, then checks the other
 * buffers linked through b_this_page. end_page_writeback() is called
 * only when none of them still has async_write set.
 */
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
char b[BDEVNAME_SIZE];
unsigned long flags;
struct buffer_head *first;
struct buffer_head *tmp;
struct page *page;

BUG_ON(!buffer_async_write(bh));

page = bh->b_page;
if (uptodate) {
set_buffer_uptodate(bh);
} else {
/* Failure: log it unless quiet_error() says not to. */
if (!quiet_error(bh)) {
buffer_io_error(bh);
printk(KERN_WARNING "lost page write due to "
"I/O error on %s\n",
         bdevname(bh->b_bdev, b));
}
/* Flag the error on the mapping, the buffer and the page. */
set_bit(AS_EIO, &page->mapping->flags);
set_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
SetPageError(page);
}

/*
 * The scan below runs with local interrupts disabled and with the
 * BH_Uptodate_Lock bit spinlock of the page's first buffer held.
 */
first = page_buffers(page);
local_irq_save(flags);
bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

clear_buffer_async_write(bh);
unlock_buffer(bh);
/* Follow b_this_page from bh until the walk arrives back at bh. */
tmp = bh->b_this_page;
while (tmp != bh) {
if (buffer_async_write(tmp)) {
/* A buffer still marked async_write must still be locked. */
BUG_ON(!buffer_locked(tmp));
goto still_busy;
}
tmp = tmp->b_this_page;
}
bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
local_irq_restore(flags);
/* No buffer on the page is still under async write. */
end_page_writeback(page);
return;

still_busy:
/* Another buffer is still in flight; leave the page's writeback state alone. */
bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
local_irq_restore(flags);
return;
}
/*
 * Finish writeback on @page: clear its writeback state and wake anyone
 * waiting on PG_writeback.
 */
void end_page_writeback(struct page *page)
{
/* If the reclaim flag was set, clear it and rotate the page. */
if (TestClearPageReclaim(page))
rotate_reclaimable_page(page);

/* The page must have been under writeback; anything else is a bug. */
if (!test_clear_page_writeback(page))
BUG();

/* Barrier between clearing the writeback bit and waking the waiters. */
smp_mb__after_clear_bit();
wake_up_page(page, PG_writeback);
}

todo：
1. 忽略了cache相关。
2. page各种状态转换。

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

瀚海书香

版主

论坛徽章:: 6

5楼 [报告]

发表于 2012-10-09 18:38 |只看该作者

回复 1# blake326
多谢分享！

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

blake326

丰衣足食

论坛徽章:: 0

6楼 [报告]

发表于 2012-10-09 18:48 |只看该作者

顺便请教一个问题：

2. iov_iter_copy_from_user_atomic通过kmap_atomic把page临时映射到一个内核地址，然后把用户buf中的数据拷贝到这个地址中去，最后释放映射。这里pagefault disable/enable的作用是什么？

这里pagefault disable/enable禁止抢占有什么作用呢？如果不加我也没想出来会出什么异常情况。

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

瀚海书香

版主

论坛徽章:: 6

7楼 [报告]

发表于 2012-10-09 19:15 |只看该作者

本帖最后由瀚海书香于 2012-10-09 19:18 编辑

回复 6# blake326

这里pagefault disable/enable禁止抢占有什么作用呢？

这里禁止抢占是为了确保kmap_atomic和kunmap_atomic类似的操作顺序执行。因为如果不禁止抢占，那么可能发生如下的情形：

1. kmap_atomic
2. 中断并返回
3. 被调度到另一个进程
4. 另一个进程可能会 kmap_atomic 这样就会导致kmap_atomic和kunmap_atomic不是串行执行，导致同一内存被kmap两次。