[Filesystem] The File Write Path

Posted 2012-10-09 18:26 by blake326

3.6 kernel

SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
                size_t, count)
{
        struct file *file;
        ssize_t ret = -EBADF;
        int fput_needed;

        file = fget_light(fd, &fput_needed);
        if (file) {
                loff_t pos = file_pos_read(file);
                ret = vfs_write(file, buf, count, &pos);
                file_pos_write(file, pos);
                fput_light(file, fput_needed);
        }

        return ret;
}
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
        ssize_t ret;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
                return -EINVAL;
        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
                return -EFAULT;

        ret = rw_verify_area(WRITE, file, pos, count);
        if (ret >= 0) {
                count = ret;
                if (file->f_op->write)
                        ret = file->f_op->write(file, buf, count, pos);
                else
                        ret = do_sync_write(file, buf, count, pos);
                if (ret > 0) {
                        fsnotify_modify(file);
                        add_wchar(current, ret);
                }
                inc_syscw(current);
        }

        return ret;
}
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
        struct kiocb kiocb;
        ssize_t ret;

        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
        kiocb.ki_left = len;
        kiocb.ki_nbytes = len;

        for (;;) {
                ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
                if (ret != -EIOCBRETRY)
                        break;
                wait_on_retry_sync_kiocb(&kiocb);
        }

        if (-EIOCBQUEUED == ret)
                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
        return ret;
}
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;

        BUG_ON(iocb->ki_pos != pos);

        sb_start_write(inode->i_sb);
        mutex_lock(&inode->i_mutex);
        ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
        mutex_unlock(&inode->i_mutex);

        if (ret > 0 || ret == -EIOCBQUEUED) {
                ssize_t err;

                err = generic_write_sync(file, pos, ret);
                if (err < 0 && ret > 0)
                        ret = err;
        }
        sb_end_write(inode->i_sb);
        return ret;
}

ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t *ppos)
{
        struct file *file = iocb->ki_filp;
        struct address_space * mapping = file->f_mapping;
        size_t ocount;                /* original count */
        size_t count;                /* after file limit checks */
        struct inode         *inode = mapping->host;
        loff_t                pos;
        ssize_t                written;
        ssize_t                err;

        ocount = 0;
        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
        if (err)
                return err;

        count = ocount;
        pos = *ppos;

        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
        written = 0;

        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
                goto out;

        if (count == 0)
                goto out;

        err = file_remove_suid(file);
        if (err)
                goto out;

        err = file_update_time(file);
        if (err)
                goto out;

        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (unlikely(file->f_flags & O_DIRECT)) {
                loff_t endbyte;
                ssize_t written_buffered;

                written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
                                                        ppos, count, ocount);
                if (written < 0 || written == count)
                        goto out;
                /*
                 * direct-io write to a hole: fall through to buffered I/O
                 * for completing the rest of the request.
                 */
                pos += written;
                count -= written;
                written_buffered = generic_file_buffered_write(iocb, iov,
                                                nr_segs, pos, ppos, count,
                                                written);
                /*
                 * If generic_file_buffered_write() returned a synchronous error
                 * then we want to return the number of bytes which were
                 * direct-written, or the error code if that was zero.  Note
                 * that this differs from normal direct-io semantics, which
                 * will return -EFOO even if some bytes were written.
                 */
                if (written_buffered < 0) {
                        err = written_buffered;
                        goto out;
                }

                /*
                 * We need to ensure that the page cache pages are written to
                 * disk and invalidated to preserve the expected O_DIRECT
                 * semantics.
                 */
                endbyte = pos + written_buffered - written - 1;
                err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
                if (err == 0) {
                        written = written_buffered;
                        invalidate_mapping_pages(mapping,
                                                 pos >> PAGE_CACHE_SHIFT,
                                                 endbyte >> PAGE_CACHE_SHIFT);
                } else {
                        /*
                         * We don't know how much we wrote, so just return
                         * the number of bytes which were direct-written
                         */
                }
        } else {
                written = generic_file_buffered_write(iocb, iov, nr_segs,
                                pos, ppos, count, written);
        }
out:
        current->backing_dev_info = NULL;
        return written ? written : err;
}
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos, loff_t *ppos,
                size_t count, ssize_t written)
{
        struct file *file = iocb->ki_filp;
        ssize_t status;
        struct iov_iter i;

        iov_iter_init(&i, iov, nr_segs, count, written);
        status = generic_perform_write(file, &i, pos);

        if (likely(status >= 0)) {
                written += status;
                *ppos = pos + status;
        }

        return written ? written : status;
}
****************************************************************************************
As with sys_read, the early part of the call chain is straightforward.
In generic_perform_write, pos is the byte offset within the file being written, and iov_iter->count holds the number of bytes to write. From these two values the pages touched by the write are computed, and each page is handled in turn:
1. a_ops->write_begin looks up (or allocates) the corresponding page-cache page, allocates a buffer_head for each block of the page if needed, and, if needed, waits for those blocks to be read from disk. In essence, write_begin plays much the same role a disk read does.
2. iov_iter_copy_from_user_atomic uses kmap_atomic to map the page temporarily at a kernel address, copies the user buffer into that address, then drops the mapping (a condensed sketch follows this block). What are pagefault_disable/pagefault_enable for here? (Discussed in the replies below.)
3. a_ops->write_end updates the buffer_head and page state as appropriate, and calls mark_inode_dirty to notify the per-bdi kernel thread to do the writeback.
****************************************************************************************
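For reference, the copy in step 2 is not quoted elsewhere in this post, so here is a condensed sketch of the single-iovec fast path of iov_iter_copy_from_user_atomic as it appears in mm/filemap.c (3.6); the multi-segment path is omitted. Because page faults are disabled around the call, __copy_from_user_inatomic may copy fewer than bytes bytes, and the caller must retry:

size_t iov_iter_copy_from_user_atomic(struct page *page,
                struct iov_iter *i, unsigned long offset, size_t bytes)
{
        char *kaddr;
        int left;
        char __user *buf = i->iov->iov_base + i->iov_offset;

        /* Temporary kernel mapping; only valid while we stay atomic. */
        kaddr = kmap_atomic(page);
        /* Returns the number of bytes NOT copied; it cannot sleep, so a
         * fault on the user buffer makes it bail out early instead. */
        left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
        kunmap_atomic(kaddr);

        return bytes - left;
}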
static ssize_t generic_perform_write(struct file *file,
                                struct iov_iter *i, loff_t pos)
{
        struct address_space *mapping = file->f_mapping;
        const struct address_space_operations *a_ops = mapping->a_ops;
        long status = 0;
        ssize_t written = 0;
        unsigned int flags = 0;

        /*
         * Copies from kernel address space cannot fail (NFSD is a big user).
         */
        if (segment_eq(get_fs(), KERNEL_DS))
                flags |= AOP_FLAG_UNINTERRUPTIBLE;

        do {
                struct page *page;
                unsigned long offset;        /* Offset into pagecache page */
                unsigned long bytes;        /* Bytes to write to page */
                size_t copied;                /* Bytes copied from user */
                void *fsdata;

                offset = (pos & (PAGE_CACHE_SIZE - 1));
                bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                                iov_iter_count(i));

again:
                /*
                 * Bring in the user page that we will copy from _first_.
                 * Otherwise there's a nasty deadlock on copying from the
                 * same page as we're writing to, without it being marked
                 * up-to-date.
                 *
                 * Not only is this an optimisation, but it is also required
                 * to check that the address is actually valid, when atomic
                 * usercopies are used, below.
                 */
                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
                        status = -EFAULT;
                        break;
                }

                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                                &page, &fsdata);
                if (unlikely(status))
                        break;

                if (mapping_writably_mapped(mapping))
                        flush_dcache_page(page);

                pagefault_disable();
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
                pagefault_enable();
                flush_dcache_page(page);

                mark_page_accessed(page);
                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
                if (unlikely(status < 0))
                        break;
                copied = status;

                cond_resched();

                iov_iter_advance(i, copied);
                if (unlikely(copied == 0)) {
                        /*
                         * If we were unable to copy any data at all, we must
                         * fall back to a single segment length write.
                         *
                         * If we didn't fallback here, we could livelock
                         * because not all segments in the iov can be copied at
                         * once without a pagefault.
                         */
                        bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                                iov_iter_single_seg_count(i));
                        goto again;
                }
                pos += copied;
                written += copied;

                balance_dirty_pages_ratelimited(mapping);
                if (fatal_signal_pending(current)) {
                        status = -EINTR;
                        break;
                }
        } while (iov_iter_count(i));

        return written ? written : status;
}

****************************************************************************************
ext2_write_begin first calls grab_cache_page_write_begin to look up the page, allocating one if it is missing. But if the page is currently under writeback (PG_writeback set: a write request has been submitted but has not yet completed), it must wait for that earlier write to finish, or the data would be corrupted.
Then, if needed, __block_write_begin allocates a buffer_head for each block of the page, and handles each buffer_head in turn (a worked example of the per-block test follows this block):
1. Check whether this write touches the block at all; if not, move on to the next block.
2. If the buffer_head is not yet mapped to disk (BH_Mapped not set; freshly allocated buffer_heads are all unmapped), ext2_get_block is called to look up the mapping. The difference from the sys_read path is that only a single block's mapping is read here.
3. If the page is already uptodate, mark the buffer_head uptodate. When the user does sys_read before sys_write, the page is usually uptodate here.
4. Conversely, if the user goes straight to sys_write, the page is not uptodate, and ll_rw_block submits buffer_head read requests to fetch the on-disk data.
After all blocks have been handled, a straight sys_write may thus have issued, say, four buffer_head reads via ll_rw_block; the code must now wait_on_buffer until all of those reads complete before going on.
****************************************************************************************
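To make the per-block test in step 1 concrete, here is a small standalone userspace sketch of the overlap logic __block_write_begin applies to each block. The numbers (4 KB page, 1 KB blocks, pos = 5000, len = 600) are made up purely for illustration:

#include <stdio.h>

int main(void)
{
        unsigned page_size = 4096, blocksize = 1024;
        long long pos = 5000;            /* file offset of the write */
        unsigned len = 600;              /* bytes to write */

        unsigned from = pos & (page_size - 1);   /* 904: start within page */
        unsigned to = from + len;                /* 1504: end within page */

        for (unsigned bs = 0; bs < page_size; bs += blocksize) {
                unsigned be = bs + blocksize;
                if (be <= from || bs >= to)
                        printf("block [%4u,%4u): untouched, skipped\n", bs, be);
                else if (bs < from || be > to)
                        printf("block [%4u,%4u): partial write, may need a read first\n", bs, be);
                else
                        printf("block [%4u,%4u): fully overwritten, no read needed\n", bs, be);
        }
        return 0;
}

So this particular write partially covers blocks 0 and 1 (both may need ll_rw_block reads if not uptodate) and never touches blocks 2 and 3.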
static int ext2_write_begin(struct file *file, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned flags,
                struct page **pagep, void **fsdata)
{
        int ret;

        ret = block_write_begin(mapping, pos, len, flags, pagep,
                                ext2_get_block);
        if (ret < 0)
                ext2_write_failed(mapping, pos + len);
        return ret;
}
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
                unsigned flags, struct page **pagep, get_block_t *get_block)
{
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
        struct page *page;
        int status;

        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;

        status = __block_write_begin(page, pos, len, get_block);
        if (unlikely(status)) {
                unlock_page(page);
                page_cache_release(page);
                page = NULL;
        }

        *pagep = page;
        return status;
}
struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                        pgoff_t index, unsigned flags)
{
        int status;
        gfp_t gfp_mask;
        struct page *page;
        gfp_t gfp_notmask = 0;

        gfp_mask = mapping_gfp_mask(mapping);
        if (mapping_cap_account_dirty(mapping))
                gfp_mask |= __GFP_WRITE;
        if (flags & AOP_FLAG_NOFS)
                gfp_notmask = __GFP_FS;
repeat:
        page = find_lock_page(mapping, index);
        if (page)
                goto found;

        page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
        if (!page)
                return NULL;
        status = add_to_page_cache_lru(page, mapping, index,
                                                GFP_KERNEL & ~gfp_notmask);
        if (unlikely(status)) {
                page_cache_release(page);
                if (status == -EEXIST)
                        goto repeat;
                return NULL;
        }
found:
        wait_on_page_writeback(page);
        return page;
}
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block)
{
        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
        unsigned to = from + len;
        struct inode *inode = page->mapping->host;
        unsigned block_start, block_end;
        sector_t block;
        int err = 0;
        unsigned blocksize, bbits;
        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

        BUG_ON(!PageLocked(page));
        BUG_ON(from > PAGE_CACHE_SIZE);
        BUG_ON(to > PAGE_CACHE_SIZE);
        BUG_ON(from > to);

        blocksize = 1 << inode->i_blkbits;
        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);
        head = page_buffers(page);

        bbits = inode->i_blkbits;
        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

        for(bh = head, block_start = 0; bh != head || !block_start;
            block++, block_start=block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                if (block_end <= from || block_start >= to) {
                        if (PageUptodate(page)) {
                                if (!buffer_uptodate(bh))
                                        set_buffer_uptodate(bh);
                        }
                        continue;
                }
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                break;
                        if (buffer_new(bh)) {
                                unmap_underlying_metadata(bh->b_bdev,
                                                        bh->b_blocknr);
                                if (PageUptodate(page)) {
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
                                        mark_buffer_dirty(bh);
                                        continue;
                                }
                                if (block_end > to || block_start < from)
                                        zero_user_segments(page,
                                                to, block_end,
                                                block_start, from);
                                continue;
                        }
                }
                if (PageUptodate(page)) {
                        if (!buffer_uptodate(bh))
                                set_buffer_uptodate(bh);
                        continue;
                }
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                     (block_start < from || block_end > to)) {
                        ll_rw_block(READ, 1, &bh);
                        *wait_bh++=bh;
                }
        }
        /*
         * If we issued read requests - let them complete.
         */
        while(wait_bh > wait) {
                wait_on_buffer(*--wait_bh);
                if (!buffer_uptodate(*wait_bh))
                        err = -EIO;
        }
        if (unlikely(err))
                page_zero_new_buffers(page, from, to);
        return err;
}






****************************************************************************************
In block_write_end, flush_dcache_page first does the architecture-specific data-cache flushing (generally needed on RISC; fairly involved, and I admit I don't fully understand it).
It then calls __block_commit_write, which walks the page's buffer_heads and calls mark_buffer_dirty on each buffer_head touched by this write, setting its dirty bit. In addition, if every buffer_head is now uptodate (the just-written ones certainly are; if the untouched ones are too), the page's uptodate bit is set.
Finally the page is unlocked and the number of bytes written is returned.
****************************************************************************************
static int ext2_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        int ret;

        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
        if (ret < len)
                ext2_write_failed(mapping, pos + len);
        return ret;
}
int generic_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct inode *inode = mapping->host;
        int i_size_changed = 0;

        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

        /*
         * No need to use i_size_read() here, the i_size
         * cannot change under us because we hold i_mutex.
         *
         * But it's important to update i_size while still holding page lock:
         * page writeout could otherwise come in and zero beyond i_size.
         */
        if (pos+copied > inode->i_size) {
                i_size_write(inode, pos+copied);
                i_size_changed = 1;
        }

        unlock_page(page);
        page_cache_release(page);

        /*
         * Don't mark the inode dirty under page lock. First, it unnecessarily
         * makes the holding time of page lock longer. Second, it forces lock
         * ordering of page lock and transaction start for journaling
         * filesystems.
         */
        if (i_size_changed)
                mark_inode_dirty(inode);

        return copied;
}

int block_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct inode *inode = mapping->host;
        unsigned start;

        start = pos & (PAGE_CACHE_SIZE - 1);

        if (unlikely(copied < len)) {
                /*
                 * The buffers that were written will now be uptodate, so we
                 * don't have to worry about a readpage reading them and
                 * overwriting a partial write. However if we have encountered
                 * a short write and only partially written into a buffer, it
                 * will not be marked uptodate, so a readpage might come in and
                 * destroy our partial write.
                 *
                 * Do the simplest thing, and just treat any short write to a
                 * non uptodate page as a zero-length write, and force the
                 * caller to redo the whole thing.
                 */
                if (!PageUptodate(page))
                        copied = 0;

                page_zero_new_buffers(page, start+copied, start+len);
        }
        flush_dcache_page(page);

        /* This could be a short (even 0-length) commit */
        __block_commit_write(inode, page, start, start+copied);

        return copied;
}
static int __block_commit_write(struct inode *inode, struct page *page,
                unsigned from, unsigned to)
{
        unsigned block_start, block_end;
        int partial = 0;
        unsigned blocksize;
        struct buffer_head *bh, *head;

        blocksize = 1 << inode->i_blkbits;

        for(bh = head = page_buffers(page), block_start = 0;
            bh != head || !block_start;
            block_start=block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                if (block_end <= from || block_start >= to) {
                        if (!buffer_uptodate(bh))
                                partial = 1;
                } else {
                        set_buffer_uptodate(bh);
                        mark_buffer_dirty(bh);
                }
                clear_buffer_new(bh);
        }

        /*
         * If this is a partial write which happened to make all buffers
         * uptodate then we can optimize away a bogus readpage() for
         * the next read(). Here we 'discover' whether the page went
         * uptodate as a result of this (potentially partial) write.
         */
        if (!partial)
                SetPageUptodate(page);
        return 0;
}

****************************************************************************************
test_set_buffer_dirty sets the dirty bit and returns its previous value. So in the normal case, the first write to a page gets as far as __set_page_dirty, which dirties the page. Of course __set_page_dirty does more than set a single bit: it also calls __mark_inode_dirty, which may wake the writeback thread.
__mark_inode_dirty checks whether the inode is dirty; if not, it marks it dirty, checks whether the dirty list of the inode's bdi (bdi->wb.b_dirty) already has inodes pending, moves this inode onto that bdi dirty list, and at that point decides whether to schedule a delayed wakeup (5 s by default) of the bdi writeback thread to do the actual disk writing (see the sketch after this block).
****************************************************************************************
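For reference, the delayed wakeup is nothing more than arming a timer; this is essentially bdi_wakeup_thread_delayed from mm/backing-dev.c (3.6):

void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
        unsigned long timeout;

        /* dirty_writeback_interval is in centiseconds (default 500),
         * so * 10 converts to ms: a 5 s delayed wakeup by default. */
        timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
        mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
}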
void mark_buffer_dirty(struct buffer_head *bh)
{
        WARN_ON_ONCE(!buffer_uptodate(bh));

        /*
         * Very *carefully* optimize the it-is-already-dirty case.
         *
         * Don't let the final "is it dirty" escape to before we
         * perhaps modified the buffer.
         */
        if (buffer_dirty(bh)) {
                smp_mb();
                if (buffer_dirty(bh))
                        return;
        }

        if (!test_set_buffer_dirty(bh)) {
                struct page *page = bh->b_page;
                if (!TestSetPageDirty(page)) {
                        struct address_space *mapping = page_mapping(page);
                        if (mapping)
                                __set_page_dirty(page, mapping, 0);
                }
        }
}
static void __set_page_dirty(struct page *page,
                struct address_space *mapping, int warn)
{
        spin_lock_irq(&mapping->tree_lock);
        if (page->mapping) {        /* Race with truncate? */
                WARN_ON_ONCE(warn && !PageUptodate(page));
                account_page_dirtied(page, mapping);
                radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
        }
        spin_unlock_irq(&mapping->tree_lock);
        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
void __mark_inode_dirty(struct inode *inode, int flags)
{
        struct super_block *sb = inode->i_sb;
        struct backing_dev_info *bdi = NULL;

        /*
         * Don't do this for I_DIRTY_PAGES - that doesn't actually
         * dirty the inode itself
         */
        if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                if (sb->s_op->dirty_inode)
                        sb->s_op->dirty_inode(inode, flags);
        }

        /*
         * make sure that changes are seen by all cpus before we test i_state
         * -- mikulas
         */
        smp_mb();

        /* avoid the locking if we can */
        if ((inode->i_state & flags) == flags)
                return;

        if (unlikely(block_dump))
                block_dump___mark_inode_dirty(inode);

        spin_lock(&inode->i_lock);
        if ((inode->i_state & flags) != flags) {
                const int was_dirty = inode->i_state & I_DIRTY;

                inode->i_state |= flags;

                /*
                 * If the inode is being synced, just update its dirty state.
                 * The unlocker will place the inode on the appropriate
                 * superblock list, based upon its state.
                 */
                if (inode->i_state & I_SYNC)
                        goto out_unlock_inode;

                /*
                 * Only add valid (hashed) inodes to the superblock's
                 * dirty list.  Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
                        if (inode_unhashed(inode))
                                goto out_unlock_inode;
                }
                if (inode->i_state & I_FREEING)
                        goto out_unlock_inode;

                /*
                 * If the inode was already on b_dirty/b_io/b_more_io, don't
                 * reposition it (that would break b_dirty time-ordering).
                 */
                if (!was_dirty) {
                        bool wakeup_bdi = false;
                        bdi = inode_to_bdi(inode);

                        if (bdi_cap_writeback_dirty(bdi)) {
                                WARN(!test_bit(BDI_registered, &bdi->state),
                                     "bdi-%s not registered\n", bdi->name);

                                /*
                                 * If this is the first dirty inode for this
                                 * bdi, we have to wake-up the corresponding
                                 * bdi thread to make sure background
                                 * write-back happens later.
                                 */
                                if (!wb_has_dirty_io(&bdi->wb))
                                        wakeup_bdi = true;
                        }

                        spin_unlock(&inode->i_lock);
                        spin_lock(&bdi->wb.list_lock);
                        inode->dirtied_when = jiffies;
                        list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
                        spin_unlock(&bdi->wb.list_lock);

                        if (wakeup_bdi)
                                bdi_wakeup_thread_delayed(bdi);
                        return;
                }
        }
out_unlock_inode:
        spin_unlock(&inode->i_lock);

}

****************************************************************************************
sys_write only writes pages; it submits no write requests. The actual writing is done by a so-called bdi writeback thread. In the code this is managed through each disk's request_queue.backing_dev_info. backing_dev_info holds auxiliary per-device information, its most important member being the embedded bdi_writeback structure:
bdi_writeback's task points to the writeback thread, if one exists.
The wakeup_timer's default callback wakes either default_backing_dev_info.wb.task or the current bdi's wb.task.
The b_dirty list holds the device's dirty inodes.
In add_disk, the bdi is registered via bdi_register_dev(&disk->queue->backing_dev_info, disk_devt(disk)); that is, every gendisk has a bdi of its own (see the sketch after this block).
****************************************************************************************
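For reference, bdi_register_dev is just a thin wrapper around bdi_register (mm/backing-dev.c, 3.6); the major:minor name it passes down is where writeback thread names like flush-8:0 come from:

int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
        /* Name the bdi after its device numbers, e.g. "8:0" for sda,
         * which later yields a writeback thread named "flush-8:0". */
        return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
}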
struct backing_dev_info {
        struct list_head bdi_list;
        unsigned long ra_pages;        /* max readahead in PAGE_CACHE_SIZE units */
        unsigned long state;        /* Always use atomic bitops on this */
        unsigned int capabilities; /* Device capabilities */
        congested_fn *congested_fn; /* Function pointer if device is md/dm */
        void *congested_data;        /* Pointer to aux data for congested func */

        char *name;

        struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];

        unsigned long bw_time_stamp;        /* last time write bw is updated */
        unsigned long dirtied_stamp;
        unsigned long written_stamp;        /* pages written at bw_time_stamp */
        unsigned long write_bandwidth;        /* the estimated write bandwidth */
        unsigned long avg_write_bandwidth; /* further smoothed write bw */

        /*
         * The base dirty throttle rate, re-calculated on every 200ms.
         * All the bdi tasks' dirty rate will be curbed under it.
         * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
         * in small steps and is much more smooth/stable than the latter.
         */
        unsigned long dirty_ratelimit;
        unsigned long balanced_dirty_ratelimit;

        struct fprop_local_percpu completions;
        int dirty_exceeded;

        unsigned int min_ratio;
        unsigned int max_ratio, max_prop_frac;

        struct bdi_writeback wb;  /* default writeback info for this bdi */
        spinlock_t wb_lock;          /* protects work_list */

        struct list_head work_list;

        struct device *dev;

        struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS
        struct dentry *debug_dir;
        struct dentry *debug_stats;
#endif
};
struct bdi_writeback {
        struct backing_dev_info *bdi;        /* our parent bdi */
        unsigned int nr;

        unsigned long last_old_flush;        /* last old data flush */
        unsigned long last_active;        /* last time bdi thread was active */

        struct task_struct *task;        /* writeback thread */
        struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
        struct list_head b_dirty;        /* dirty inodes */
        struct list_head b_io;                /* parked for writeback */
        struct list_head b_more_io;        /* parked for more writeback */
        spinlock_t list_lock;                /* protects the b_* lists */
};
static void wakeup_timer_fn(unsigned long data)
{
        struct backing_dev_info *bdi = (struct backing_dev_info *)data;

        spin_lock_bh(&bdi->wb_lock);
        if (bdi->wb.task) {
                trace_writeback_wake_thread(bdi);
                wake_up_process(bdi->wb.task);
        } else if (bdi->dev) {
                /*
                 * When bdi tasks are inactive for long time, they are killed.
                 * In this case we have to wake-up the forker thread which
                 * should create and run the bdi thread.
                 */
                trace_writeback_wake_forker_thread(bdi);
                wake_up_process(default_backing_dev_info.wb.task);
        }
        spin_unlock_bh(&bdi->wb_lock);
}



****************************************************************************************
Besides the bdi that each gendisk registers via bdi_register, the kernel also registers a default one, default_backing_dev_info. This bdi creates a thread named bdi-default that manages the writeback threads of all the ordinary bdis.
****************************************************************************************
struct backing_dev_info default_backing_dev_info = {
        .name                = "default",
        .ra_pages        = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
        .state                = 0,
        .capabilities        = BDI_CAP_MAP_COPY,
};
static int __init default_bdi_init(void)
{
        int err;

        err = bdi_init(&default_backing_dev_info);
        if (!err)
                bdi_register(&default_backing_dev_info, NULL, "default");
        err = bdi_init(&noop_backing_dev_info);

        return err;
}
subsys_initcall(default_bdi_init);
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...)
{
        va_list args;
        struct device *dev;

        if (bdi->dev)        /* The driver needs to use separate queues per device */
                return 0;

        va_start(args, fmt);
        dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
        va_end(args);
        if (IS_ERR(dev))
                return PTR_ERR(dev);

        bdi->dev = dev;

        /*
         * Just start the forker thread for our default backing_dev_info,
         * and add other bdi's to the list. They will get a thread created
         * on-demand when they need it.
         */
        if (bdi_cap_flush_forker(bdi)) {
                struct bdi_writeback *wb = &bdi->wb;

                wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
                                                dev_name(dev));
                if (IS_ERR(wb->task))
                        return PTR_ERR(wb->task);
        }

        bdi_debug_register(bdi, dev_name(dev));
        set_bit(BDI_registered, &bdi->state);

        spin_lock_bh(&bdi_lock);
        list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
        spin_unlock_bh(&bdi_lock);

        trace_writeback_bdi_register(bdi);
        return 0;
}


****************************************************************************************
The default bdi thread, bdi_forker_thread, is essentially an infinite loop, normally woken by the wakeup_timer. It proceeds as follows:
1. Check whether the default bdi's wb has dirty I/O, or whether there is queued bdi work. If so, delete the wakeup_timer and call wb_do_writeback to write back.
2. Walk all registered bdis via the bdi_list and examine each one.
If a bdi is not writeback-capable (an in-memory bdi, say), continue to the next bdi.
Otherwise check in the same way whether the bdi's wb has dirty I/O or queued work; if it does, and bdi->wb.task == NULL, set action = FORK_THREAD and break.
If the bdi has no work to do, and its bdi->wb.task has now been idle longer than bdi_longest_inactive() (on the order of 5 seconds), set action = KILL_THREAD and break.
3. If a bdi was found that needs a thread, kthread_create(bdi_writeback_thread, &bdi->wb, "flush-%s", dev_name(bdi->dev)) creates a writeback thread named something like flush-8:0.
4. If there is no bdi to fork for and none to kill, the default bdi thread sleeps for about 5 s; perhaps on waking it will find some bdi that has been inactive long enough to kill.
****************************************************************************************
static int bdi_forker_thread(void *ptr)
{
        struct bdi_writeback *me = ptr;

        current->flags |= PF_SWAPWRITE;
        set_freezable();

        /*
         * Our parent may run at a different priority, just set us to normal
         */
        set_user_nice(current, 0);

        for (;;) {
                struct task_struct *task = NULL;
                struct backing_dev_info *bdi;
                enum {
                        NO_ACTION,   /* Nothing to do */
                        FORK_THREAD, /* Fork bdi thread */
                        KILL_THREAD, /* Kill inactive bdi thread */
                } action = NO_ACTION;

                /*
                 * Temporary measure, we want to make sure we don't see
                 * dirty data on the default backing_dev_info
                 */
                if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
                        del_timer(&me->wakeup_timer);
                        wb_do_writeback(me, 0);
                }

                spin_lock_bh(&bdi_lock);
                /*
                 * In the following loop we are going to check whether we have
                 * some work to do without any synchronization with tasks
                 * waking us up to do work for them. Set the task state here
                 * so that we don't miss wakeups after verifying conditions.
                 */
                set_current_state(TASK_INTERRUPTIBLE);

                list_for_each_entry(bdi, &bdi_list, bdi_list) {
                        bool have_dirty_io;

                        if (!bdi_cap_writeback_dirty(bdi) ||
                             bdi_cap_flush_forker(bdi))
                                continue;

                        WARN(!test_bit(BDI_registered, &bdi->state),
                             "bdi %p/%s is not registered!\n", bdi, bdi->name);

                        have_dirty_io = !list_empty(&bdi->work_list) ||
                                        wb_has_dirty_io(&bdi->wb);

                        /*
                         * If the bdi has work to do, but the thread does not
                         * exist - create it.
                         */
                        if (!bdi->wb.task && have_dirty_io) {
                                /*
                                 * Set the pending bit - if someone will try to
                                 * unregister this bdi - it'll wait on this bit.
                                 */
                                set_bit(BDI_pending, &bdi->state);
                                action = FORK_THREAD;
                                break;
                        }

                        spin_lock(&bdi->wb_lock);

                        /*
                         * If there is no work to do and the bdi thread was
                         * inactive long enough - kill it. The wb_lock is taken
                         * to make sure no-one adds more work to this bdi and
                         * wakes the bdi thread up.
                         */
                        if (bdi->wb.task && !have_dirty_io &&
                            time_after(jiffies, bdi->wb.last_active +
                                                bdi_longest_inactive())) {
                                task = bdi->wb.task;
                                bdi->wb.task = NULL;
                                spin_unlock(&bdi->wb_lock);
                                set_bit(BDI_pending, &bdi->state);
                                action = KILL_THREAD;
                                break;
                        }
                        spin_unlock(&bdi->wb_lock);
                }
                spin_unlock_bh(&bdi_lock);

                /* Keep working if default bdi still has things to do */
                if (!list_empty(&me->bdi->work_list))
                        __set_current_state(TASK_RUNNING);

                switch (action) {
                case FORK_THREAD:
                        __set_current_state(TASK_RUNNING);
                        task = kthread_create(bdi_writeback_thread, &bdi->wb,
                                              "flush-%s", dev_name(bdi->dev));
                        if (IS_ERR(task)) {
                                /*
                                 * If thread creation fails, force writeout of
                                 * the bdi from the thread. Hopefully 1024 is
                                 * large enough for efficient IO.
                                 */
                                writeback_inodes_wb(&bdi->wb, 1024,
                                                    WB_REASON_FORKER_THREAD);
                        } else {
                                /*
                                 * The spinlock makes sure we do not lose
                                 * wake-ups when racing with 'bdi_queue_work()'.
                                 * And as soon as the bdi thread is visible, we
                                 * can start it.
                                 */
                                spin_lock_bh(&bdi->wb_lock);
                                bdi->wb.task = task;
                                spin_unlock_bh(&bdi->wb_lock);
                                wake_up_process(task);
                        }
                        bdi_clear_pending(bdi);
                        break;

                case KILL_THREAD:
                        __set_current_state(TASK_RUNNING);
                        kthread_stop(task);
                        bdi_clear_pending(bdi);
                        break;

                case NO_ACTION:
                        if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
                                /*
                                 * There are no dirty data. The only thing we
                                 * should now care about is checking for
                                 * inactive bdi threads and killing them. Thus,
                                 * let's sleep for longer time, save energy and
                                 * be friendly for battery-driven devices.
                                 */
                                schedule_timeout(bdi_longest_inactive());
                        else
                                schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
                        try_to_freeze();
                        break;
                }
        }

        return 0;
}

****************************************************************************************
Back to sys_write: after block_write_end, __mark_inode_dirty wakes the default bdi thread or bdi.wb.task if needed. All roads lead to Rome: bdi_writeback_thread eventually runs.
bdi_writeback_thread is in essence a loop that calls wb_do_writeback to write back.
If there is still work left after a pass, it schedules out with a 5 s timeout; otherwise it schedules out indefinitely, waiting to be woken by something like sys_write.

The kernel's disk writeback policy is therefore clear:
If the disk sees a steady stream of writes, the bdi wb task stays alive and batches the writeback work once every 5 s.
If the disk sees only occasional writes, then typically a write is followed 5 s later by the timer waking the default bdi thread, which creates and runs a per-disk flush thread; that flush thread does the writeback, and after roughly another 5 s of inactivity the bdi wb task is killed by bdi-default. Then, at some point, another write comes along.
Note, however, that a wb_do_writeback pass does not necessarily submit write requests to the disk. Broadly, page writes are submitted only when one of two conditions holds: a page has been dirty for over 30 s, or the ratio of dirty pages has grown too high. (Both figures are tunable defaults; see the sketch after this block.)
****************************************************************************************
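The 5 s and 30 s figures above are just the defaults of tunable knobs. A minimal userspace sketch that prints them (the procfs paths are standard; the values vary per system):

#include <stdio.h>

int main(void)
{
        /* dirty_writeback_centisecs: dirty_writeback_interval, default 500
         *   (the 5 s writeback period).
         * dirty_expire_centisecs: dirty_expire_interval, default 3000
         *   (the 30 s dirty-page expiry used by for_kupdate work). */
        const char *knobs[] = {
                "/proc/sys/vm/dirty_writeback_centisecs",
                "/proc/sys/vm/dirty_expire_centisecs",
                "/proc/sys/vm/dirty_background_ratio",
        };
        char buf[64];

        for (int i = 0; i < 3; i++) {
                FILE *f = fopen(knobs[i], "r");
                if (f && fgets(buf, sizeof(buf), f))
                        printf("%s = %s", knobs[i], buf);
                if (f)
                        fclose(f);
        }
        return 0;
}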
int bdi_writeback_thread(void *data)
{
        struct bdi_writeback *wb = data;
        struct backing_dev_info *bdi = wb->bdi;
        long pages_written;

        /* (setup: PF_SWAPWRITE, set_freezable, etc., elided) */
        while (!kthread_freezable_should_stop(NULL)) {

                del_timer(&wb->wakeup_timer);

                pages_written = wb_do_writeback(wb, 0);

                trace_writeback_pages_written(pages_written);

                if (pages_written)
                        wb->last_active = jiffies;

                set_current_state(TASK_INTERRUPTIBLE);
                if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
                        __set_current_state(TASK_RUNNING);
                        continue;
                }

                if (wb_has_dirty_io(wb) && dirty_writeback_interval)
                        schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
                else {
                        /*
                         * We have nothing to do, so can go sleep without any
                         * timeout and save power. When a work is queued or
                         * something is made dirty - we will be woken up.
                         */
                        schedule();
                }
        }

        /* Flush any work that raced with us exiting */
        if (!list_empty(&bdi->work_list))
                wb_do_writeback(wb, 1);

        trace_writeback_thread_stop(bdi);
        return 0;
}



****************************************************************************************
wb_do_writeback is the function that actually performs writeback work. Via get_next_work_item it takes every wb_writeback_work item off the bdi->work_list queue and runs wb_writeback on each. In other words, all concrete work is the processing of a wb_writeback_work. But in vfs_write all we did was move the dirty inode onto the bdi->wb.b_dirty list; so where do wb_writeback_work items come from?
wb_check_old_data_flush() generates a for_kupdate work item, processed through wb_writeback.
wb_check_background_flush() generates a for_background work item, processed through wb_writeback.
So the concrete logic all lives in wb_writeback: for_kupdate writes back pages dirty for more than 30 s, while for_background checks whether the dirty-page ratio is too high or free pages too few, and writes back a batch of pages. (A sketch of wb_check_old_data_flush follows this block.)
****************************************************************************************
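For reference, a lightly abridged sketch of wb_check_old_data_flush, roughly as in fs/fs-writeback.c (3.6); note the 5 s rate limit and the for_kupdate work item it builds:

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
        unsigned long expired;
        long nr_pages;

        /* dirty_writeback_interval == 0 disables periodic writeback. */
        if (!dirty_writeback_interval)
                return 0;

        /* Rate-limit: at most once per interval (5 s by default). */
        expired = wb->last_old_flush +
                        msecs_to_jiffies(dirty_writeback_interval * 10);
        if (time_before(jiffies, expired))
                return 0;

        wb->last_old_flush = jiffies;
        nr_pages = get_nr_dirty_pages();

        if (nr_pages) {
                struct wb_writeback_work work = {
                        .nr_pages       = nr_pages,
                        .sync_mode      = WB_SYNC_NONE,
                        .for_kupdate    = 1,
                        .range_cyclic   = 1,
                        .reason         = WB_REASON_PERIODIC,
                };

                return wb_writeback(wb, &work);
        }

        return 0;
}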
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
        struct backing_dev_info *bdi = wb->bdi;
        struct wb_writeback_work *work;
        long wrote = 0;

        set_bit(BDI_writeback_running, &wb->bdi->state);
        while ((work = get_next_work_item(bdi)) != NULL) {
                /*
                 * Override sync mode, in case we must wait for completion
                 * because this thread is exiting now.
                 */
                if (force_wait)
                        work->sync_mode = WB_SYNC_ALL;

                trace_writeback_exec(bdi, work);

                wrote += wb_writeback(wb, work);

                /*
                 * Notify the caller of completion if this is a synchronous
                 * work item, otherwise just free it.
                 */
                if (work->done)
                        complete(work->done);
                else
                        kfree(work);
        }

        /*
         * Check for periodic writeback, kupdated() style
         */
        wrote += wb_check_old_data_flush(wb);
        wrote += wb_check_background_flush(wb);
        clear_bit(BDI_writeback_running, &wb->bdi->state);

        return wrote;
}


****************************************************************************************
For a for_background work item, over_bground_thresh is checked first, and the work is actually processed only if there is too much dirty data; work->older_than_this is the current time.
For a for_kupdate work item, work->older_than_this is set to 30 s before the current time; queue_io uses it.
If the work is to proceed, queue_io is called to prepare the b_io processing queue: it splices b_more_io onto b_io and moves any expired inodes from b_dirty onto b_io (the 30 s expiry cutoff only matters for for_kupdate work).
Then __writeback_inodes_wb processes every inode on b_io.
__writeback_inodes_wb is rather convoluted, but in effect it calls writeback_sb_inodes for each inode, which in turn calls __writeback_single_inode to write back a single inode.
****************************************************************************************
static long wb_writeback(struct bdi_writeback *wb,
                         struct wb_writeback_work *work)
{
        unsigned long wb_start = jiffies;
        long nr_pages = work->nr_pages;
        unsigned long oldest_jif;
        struct inode *inode;
        long progress;

        oldest_jif = jiffies;
        work->older_than_this = &oldest_jif;

        spin_lock(&wb->list_lock);
        for (;;) {
                /*
                 * Stop writeback when nr_pages has been consumed
                 */
                if (work->nr_pages <= 0)
                        break;

                /*
                 * Background writeout and kupdate-style writeback may
                 * run forever. Stop them if there is other work to do
                 * so that e.g. sync can proceed. They'll be restarted
                 * after the other works are all done.
                 */
                if ((work->for_background || work->for_kupdate) &&
                    !list_empty(&wb->bdi->work_list))
                        break;

                /*
                 * For background writeout, stop when we are below the
                 * background dirty threshold
                 */
                if (work->for_background && !over_bground_thresh(wb->bdi))
                        break;

                /*
                 * Kupdate and background works are special and we want to
                 * include all inodes that need writing. Livelock avoidance is
                 * handled by these works yielding to any other work so we are
                 * safe.
                 */
                if (work->for_kupdate) {
                        oldest_jif = jiffies -
                                msecs_to_jiffies(dirty_expire_interval * 10);
                } else if (work->for_background)
                        oldest_jif = jiffies;

                trace_writeback_start(wb->bdi, work);
                if (list_empty(&wb->b_io))
                        queue_io(wb, work);
                if (work->sb)
                        progress = writeback_sb_inodes(work->sb, wb, work);
                else
                        progress = __writeback_inodes_wb(wb, work);
                trace_writeback_written(wb->bdi, work);

                wb_update_bandwidth(wb, wb_start);

                /*
                 * Did we write something? Try for more
                 *
                 * Dirty inodes are moved to b_io for writeback in batches.
                 * The completion of the current batch does not necessarily
                 * mean the overall work is done. So we keep looping as long
                 * as made some progress on cleaning pages or inodes.
                 */
                if (progress)
                        continue;
                /*
                 * No more inodes for IO, bail
                 */
                if (list_empty(&wb->b_more_io))
                        break;
                /*
                 * Nothing written. Wait for some inode to
                 * become available for writeback. Otherwise
                 * we'll just busyloop.
                 */
                if (!list_empty(&wb->b_more_io))  {
                        trace_writeback_wait(wb->bdi, work);
                        inode = wb_inode(wb->b_more_io.prev);
                        spin_lock(&inode->i_lock);
                        spin_unlock(&wb->list_lock);
                        /* This function drops i_lock... */
                        inode_sleep_on_writeback(inode);
                        spin_lock(&wb->list_lock);
                }
        }
        spin_unlock(&wb->list_lock);

        return nr_pages - work->nr_pages;
}


/*
* Queue all expired dirty inodes for io, eldest first.
* Before
*         newly dirtied     b_dirty    b_io    b_more_io
*         =============>    gf         edc     BA
* After
*         newly dirtied     b_dirty    b_io    b_more_io
*         =============>    g          fBAedc
*                                           |
*                                           +--> dequeue for IO
*/
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
        int moved;
        assert_spin_locked(&wb->list_lock);
        list_splice_init(&wb->b_more_io, &wb->b_io);
        moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
        trace_writeback_queue_io(wb, work, moved);
}

论坛徽章:
0
发表于 2012-10-09 18:28 |显示全部楼层
****************************************************************************************
__writeback_single_inode first calls do_writepages, which, under a block plug, writes out all of the address_space's dirty pages.
If necessary, the inode itself is then written back.
We only care about do_writepages here.
****************************************************************************************
static int
__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
                         struct writeback_control *wbc)
{
        struct address_space *mapping = inode->i_mapping;
        long nr_to_write = wbc->nr_to_write;
        unsigned dirty;
        int ret;

        WARN_ON(!(inode->i_state & I_SYNC));

        ret = do_writepages(mapping, wbc);

        /*
         * Make sure to wait on the data before writing out the metadata.
         * This is important for filesystems that modify metadata on data
         * I/O completion.
         */
        if (wbc->sync_mode == WB_SYNC_ALL) {
                int err = filemap_fdatawait(mapping);
                if (ret == 0)
                        ret = err;
        }

        /*
         * Some filesystems may redirty the inode during the writeback
         * due to delalloc, clear dirty metadata flags right before
         * write_inode()
         */
        spin_lock(&inode->i_lock);
        /* Clear I_DIRTY_PAGES if we've written out all dirty pages */
        if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                inode->i_state &= ~I_DIRTY_PAGES;
        dirty = inode->i_state & I_DIRTY;
        inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
        spin_unlock(&inode->i_lock);
        /* Don't write the inode if only I_DIRTY_PAGES was set */
        if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                int err = write_inode(inode, wbc);
                if (ret == 0)
                        ret = err;
        }
        trace_writeback_single_inode(inode, wbc, nr_to_write);
        return ret;
}


****************************************************************************************
write_cache_pages effectively calls mapping->a_ops->writepage on each page, e.g. ext2_writepage, which ultimately reaches block_write_full_page_endio.
block_write_full_page_endio first sets the page's PG_writeback, then calls submit_bh on each buffer_head (a condensed sketch of this submit loop follows the block).

Since every block of every page is submitted via submit_bh, each block gets its own bio; as with reads, these bios can of course be merged into a single request. Each bio's completion therefore invokes the end_buffer_async_write callback, which updates state at both the buffer_head and the page level:
unlock_buffer, so that a read of this bh issued while the write request was in flight can now be woken;
and once no buffer_head on the page remains under async write, clear PG_writeback and wake any process waiting on PG_writeback.
****************************************************************************************
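For reference, a heavily condensed sketch of the tail of __block_write_full_page (fs/buffer.c, 3.6), which block_write_full_page_endio reaches; the block-mapping loop and error paths are omitted:

        /* Mark the page as under writeback before any I/O is issued. */
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);

        /* One submit_bh (hence one bio) per dirty block of the page.
         * Each completion fires end_buffer_async_write(); the last
         * buffer to finish calls end_page_writeback(). */
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        submit_bh(write_op, bh);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        unlock_page(page);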
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
        int ret;

        if (wbc->nr_to_write <= 0)
                return 0;
        if (mapping->a_ops->writepages)
                ret = mapping->a_ops->writepages(mapping, wbc);
        else
                ret = generic_writepages(mapping, wbc);
        return ret;
}
int generic_writepages(struct address_space *mapping,
                       struct writeback_control *wbc)
{
        struct blk_plug plug;
        int ret;

        /* deal with chardevs and other special file */
        if (!mapping->a_ops->writepage)
                return 0;

        blk_start_plug(&plug);
        ret = write_cache_pages(mapping, wbc, __writepage, mapping);
        blk_finish_plug(&plug);
        return ret;
}
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, ext2_get_block, wbc);
}
int block_write_full_page(struct page *page, get_block_t *get_block,
                        struct writeback_control *wbc)
{
        return block_write_full_page_endio(page, get_block, wbc,
                                           end_buffer_async_write);
}

void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
        char b[BDEVNAME_SIZE];
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;

        BUG_ON(!buffer_async_write(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                if (!quiet_error(bh)) {
                        buffer_io_error(bh);
                        printk(KERN_WARNING "lost page write due to "
                                        "I/O error on %s\n",
                               bdevname(bh->b_bdev, b));
                }
                set_bit(AS_EIO, &page->mapping->flags);
                set_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
                SetPageError(page);
        }

        first = page_buffers(page);
        local_irq_save(flags);
        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

        clear_buffer_async_write(bh);
        unlock_buffer(bh);
        tmp = bh->b_this_page;
        while (tmp != bh) {
                if (buffer_async_write(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        }
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        end_page_writeback(page);
        return;

still_busy:
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        return;
}
void end_page_writeback(struct page *page)
{
        if (TestClearPageReclaim(page))
                rotate_reclaimable_page(page);

        if (!test_clear_page_writeback(page))
                BUG();

        smp_mb__after_clear_bit();
        wake_up_page(page, PG_writeback);
}


TODO:
1. Cache-related details have been ignored.
2. The various page state transitions.

瀚海书香 replied 2012-10-09 18:38, to #1 blake326:
Thanks for sharing!

blake326 replied 2012-10-09 18:48:
A question, by the way, about point 2 of my notes above ("iov_iter_copy_from_user_atomic uses kmap_atomic to map the page temporarily at a kernel address, copies the user buffer into that address, then drops the mapping. What are pagefault_disable/pagefault_enable for?"):

What does disabling preemption with pagefault_disable/pagefault_enable actually accomplish here? Even without it, I can't think of what would go wrong.

瀚海书香 replied 2012-10-09 19:15, to #6 blake326:
> What does disabling preemption with pagefault_disable/pagefault_enable actually accomplish here?

Disabling preemption here ensures that paired operations like kmap_atomic and kunmap_atomic execute in order. If preemption were not disabled, the following could happen:

1. kmap_atomic
2. an interrupt arrives and returns
3. the CPU schedules to another process
4. the other process may itself call kmap_atomic, so kmap_atomic/kunmap_atomic no longer run serially, and the same memory gets kmapped twice.
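For reference, pagefault_disable itself (include/linux/uaccess.h, 3.6) just raises the preempt count. A side effect worth noting: the fault handler then sees in_atomic() and fails the user copy immediately instead of sleeping to page the buffer in, which is exactly what lets generic_perform_write retry after iov_iter_fault_in_readable:

static inline void pagefault_disable(void)
{
        /* Raising the preempt count both prevents preemption and makes
         * in_atomic() true, so a fault during the atomic user copy
         * cannot sleep; the copy returns short and the caller retries. */
        inc_preempt_count();
        /*
         * make sure to have issued the store before a pagefault
         * can hit.
         */
        barrier();
}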

瀚海书香 replied 2012-10-09 19:18, to #1 blake326:
Going by the code, if a process writes a page that happens to be under PG_writeback, it blocks until the writeback completes.
So if a file is always written in append ("a+") mode, can the page being written end up in PG_writeback, and will the write block?

blake326 replied 2012-10-09 19:39, to #7 瀚海书香:
kmap_atomic is a temporary kernel mapping, and the function already disables preemption internally, unlike the permanent kernel mapping kmap.

blake326 replied 2012-10-09 19:43, to #8 瀚海书香:
With "a+", pos advances by default, so successive writes generally land on different pages and shouldn't affect each other.
If two writes do hit the same page at once, they will certainly be affected by PG_writeback.