
[Filesystem] The File Write Process (thread starter: blake326)

#8 | Posted 2012-10-09 19:18
Reply to #1 blake326
According to the code walkthrough, if a process write()s to a page that happens to be under PG_writeback, it blocks until the writeback completes. So if a file is always written in append ("a+") mode, can the pages involved still be found in PG_writeback, and can the write still block?
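Not from the thread itself, but a minimal user-space sketch one could use to probe this; the file name, write size and the 10 ms report threshold are all arbitrary choices:

/* Hypothetical experiment: append to a file in a loop and time each
 * write(); an occasional multi-millisecond stall would be consistent
 * with the writer blocking on a page still under PG_writeback. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        struct timeval t0, t1;
        int fd = open("testfile", O_WRONLY | O_CREAT | O_APPEND, 0644);

        if (fd < 0)
                return 1;
        memset(buf, 'x', sizeof(buf));
        for (int i = 0; i < 100000; i++) {
                gettimeofday(&t0, NULL);
                if (write(fd, buf, sizeof(buf)) != sizeof(buf))
                        break;
                gettimeofday(&t1, NULL);
                long us = (t1.tv_sec - t0.tv_sec) * 1000000L
                        + (t1.tv_usec - t0.tv_usec);
                if (us > 10000)        /* report writes slower than 10 ms */
                        printf("write %d took %ld us\n", i, us);
        }
        close(fd);
        return 0;
}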

   

#7 | Posted 2012-10-09 19:15 (last edited by 瀚海书香 on 2012-10-09 19:18)

Reply to #6 blake326
> What effect does disabling preemption with pagefault_disable/enable have here?

Preemption is disabled here to ensure that paired operations like kmap_atomic and kunmap_atomic execute back to back. If preemption were not disabled, the following could happen:

1. kmap_atomic
2. an interrupt arrives and returns
3. the CPU is rescheduled to another process
4. the other process may itself call kmap_atomic; then the kmap_atomic/kunmap_atomic pairs no longer run serially, and the same per-CPU atomic mapping slot ends up mapped twice.
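For reference, the call site being discussed looks roughly like this in kernels of that era; a simplified, single-segment sketch from memory, not a verbatim quote:

/* Sketch of iov_iter_copy_from_user_atomic() (single-segment case).
 * kmap_atomic() disables preemption; the caller has additionally done
 * pagefault_disable(), so a fault during the user copy cannot sleep and
 * __copy_from_user_inatomic() returns the bytes left uncopied instead. */
size_t iov_iter_copy_from_user_atomic(struct page *page,
                struct iov_iter *i, unsigned long offset, size_t bytes)
{
        char *kaddr = kmap_atomic(page);
        char __user *buf = i->iov->iov_base + i->iov_offset;
        int left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
        size_t copied = bytes - left;

        kunmap_atomic(kaddr);
        return copied;
}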

#6 | Posted 2012-10-09 18:48
While we're at it, a question:

> 2. iov_iter_copy_from_user_atomic uses kmap_atomic to map the page temporarily to a kernel address, copies the user buf into that address, then releases the mapping. What are pagefault disable/enable for??

What effect does disabling preemption with pagefault_disable/enable have here? I couldn't come up with any failure case if it were omitted, either.

#5 | Posted 2012-10-09 18:38
Reply to #1 blake326
Thanks for sharing!

   

#4 | Posted 2012-10-09 18:28
****************************************************************************************
__writeback_single_inode first calls do_writepages, with plugging, to write out the address_space's pages.
If necessary, the inode itself is then written back.
We only care about do_writepages here.
****************************************************************************************
static int
__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
                         struct writeback_control *wbc)
{
        struct address_space *mapping = inode->i_mapping;
        long nr_to_write = wbc->nr_to_write;
        unsigned dirty;
        int ret;

        WARN_ON(!(inode->i_state & I_SYNC));

        ret = do_writepages(mapping, wbc);

        /*
         * Make sure to wait on the data before writing out the metadata.
         * This is important for filesystems that modify metadata on data
         * I/O completion.
         */
        if (wbc->sync_mode == WB_SYNC_ALL) {
                int err = filemap_fdatawait(mapping);
                if (ret == 0)
                        ret = err;
        }

        /*
         * Some filesystems may redirty the inode during the writeback
         * due to delalloc, clear dirty metadata flags right before
         * write_inode()
         */
        spin_lock(&inode->i_lock);
        /* Clear I_DIRTY_PAGES if we've written out all dirty pages */
        if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                inode->i_state &= ~I_DIRTY_PAGES;
        dirty = inode->i_state & I_DIRTY;
        inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
        spin_unlock(&inode->i_lock);
        /* Don't write the inode if only I_DIRTY_PAGES was set */
        if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                int err = write_inode(inode, wbc);
                if (ret == 0)
                        ret = err;
        }
        trace_writeback_single_inode(inode, wbc, nr_to_write);
        return ret;
}


****************************************************************************************
write_cache_pages in effect calls mapping->a_ops->writepage on each page, e.g. ext2_writepage, which ultimately reaches block_write_full_page_endio.
block_write_full_page_endio first sets PG_writeback on the page, then calls submit_bh on each buffer_head.

Because every block of every page is submitted through its own submit_bh, each block gets its own bio (as with reads, these bios may of course be merged into a single request). So when each bio completes, the end_buffer_async_write callback runs, updating state at both the buffer_head and the page level:
- unlock_buffer, so that any reader that blocked on the bh while the write was in flight can be woken;
- once no buffer_head of the page is still under async write, clear PG_writeback and wake the processes waiting on it.
****************************************************************************************
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
        int ret;

        if (wbc->nr_to_write <= 0)
                return 0;
        if (mapping->a_ops->writepages)
                ret = mapping->a_ops->writepages(mapping, wbc);
        else
                ret = generic_writepages(mapping, wbc);
        return ret;
}
int generic_writepages(struct address_space *mapping,
                       struct writeback_control *wbc)
{
        struct blk_plug plug;
        int ret;

        /* deal with chardevs and other special file */
        if (!mapping->a_ops->writepage)
                return 0;

        blk_start_plug(&plug);
        ret = write_cache_pages(mapping, wbc, __writepage, mapping);
        blk_finish_plug(&plug);
        return ret;
}
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, ext2_get_block, wbc);
}
int block_write_full_page(struct page *page, get_block_t *get_block,
                        struct writeback_control *wbc)
{
        return block_write_full_page_endio(page, get_block, wbc,
                                           end_buffer_async_write);
}

void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
        char b[BDEVNAME_SIZE];
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;

        BUG_ON(!buffer_async_write(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                if (!quiet_error(bh)) {
                        buffer_io_error(bh);
                        printk(KERN_WARNING "lost page write due to "
                                        "I/O error on %s\n",
                               bdevname(bh->b_bdev, b));
                }
                set_bit(AS_EIO, &page->mapping->flags);
                set_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
                SetPageError(page);
        }

        first = page_buffers(page);
        local_irq_save(flags);
        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

        clear_buffer_async_write(bh);
        unlock_buffer(bh);
        tmp = bh->b_this_page;
        while (tmp != bh) {
                if (buffer_async_write(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        }
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        end_page_writeback(page);
        return;

still_busy:
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        return;
}
void end_page_writeback(struct page *page)
{
        if (TestClearPageReclaim(page))
                rotate_reclaimable_page(page);

        if (!test_clear_page_writeback(page))
                BUG();

        smp_mb__after_clear_bit();
        wake_up_page(page, PG_writeback);
}
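For completeness, the waiters woken here blocked in wait_on_page_writeback(), which in kernels of that era was (from memory) just:

/* The write path blocks on a page under writeback via this helper. */
static inline void wait_on_page_writeback(struct page *page)
{
        if (PageWriteback(page))
                wait_on_page_bit(page, PG_writeback);
}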


TODO:
1. Cache-related details are ignored.
2. The various page state transitions.

#3 | Posted 2012-10-09 18:27
****************************************************************************************
sys_write only writes pages; it submits no write requests. The actual writing is the job of a "bdi writeback" thread. In the code this is managed per disk by request_queue.backing_dev_info. backing_dev_info holds auxiliary information about the disk; its most important member is the bdi_writeback structure:
- bdi_writeback.task points to the writeback thread, if one exists.
- wakeup_timer is a timer whose default callback wakes default_backing_dev_info.wb.task or the current bdi's wb.task.
- The b_dirty list holds this disk's dirty inodes.
In add_disk, the bdi is registered via bdi_register_dev(&disk->queue->backing_dev_info, disk_devt(disk)); that is, every gendisk has a bdi.
****************************************************************************************
struct backing_dev_info {
        struct list_head bdi_list;
        unsigned long ra_pages;        /* max readahead in PAGE_CACHE_SIZE units */
        unsigned long state;        /* Always use atomic bitops on this */
        unsigned int capabilities; /* Device capabilities */
        congested_fn *congested_fn; /* Function pointer if device is md/dm */
        void *congested_data;        /* Pointer to aux data for congested func */

        char *name;

        struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];

        unsigned long bw_time_stamp;        /* last time write bw is updated */
        unsigned long dirtied_stamp;
        unsigned long written_stamp;        /* pages written at bw_time_stamp */
        unsigned long write_bandwidth;        /* the estimated write bandwidth */
        unsigned long avg_write_bandwidth; /* further smoothed write bw */

        /*
         * The base dirty throttle rate, re-calculated on every 200ms.
         * All the bdi tasks' dirty rate will be curbed under it.
         * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
         * in small steps and is much more smooth/stable than the latter.
         */
        unsigned long dirty_ratelimit;
        unsigned long balanced_dirty_ratelimit;

        struct fprop_local_percpu completions;
        int dirty_exceeded;

        unsigned int min_ratio;
        unsigned int max_ratio, max_prop_frac;

        struct bdi_writeback wb;  /* default writeback info for this bdi */
        spinlock_t wb_lock;          /* protects work_list */

        struct list_head work_list;

        struct device *dev;

        struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS
        struct dentry *debug_dir;
        struct dentry *debug_stats;
#endif
};
struct bdi_writeback {
        struct backing_dev_info *bdi;        /* our parent bdi */
        unsigned int nr;

        unsigned long last_old_flush;        /* last old data flush */
        unsigned long last_active;        /* last time bdi thread was active */

        struct task_struct *task;        /* writeback thread */
        struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
        struct list_head b_dirty;        /* dirty inodes */
        struct list_head b_io;                /* parked for writeback */
        struct list_head b_more_io;        /* parked for more writeback */
        spinlock_t list_lock;                /* protects the b_* lists */
};
static void wakeup_timer_fn(unsigned long data)
{
        struct backing_dev_info *bdi = (struct backing_dev_info *)data;

        spin_lock_bh(&bdi->wb_lock);
        if (bdi->wb.task) {
                trace_writeback_wake_thread(bdi);
                wake_up_process(bdi->wb.task);
        } else if (bdi->dev) {
                /*
                 * When bdi tasks are inactive for long time, they are killed.
                 * In this case we have to wake-up the forker thread which
                 * should create and run the bdi thread.
                 */
                trace_writeback_wake_forker_thread(bdi);
                wake_up_process(default_backing_dev_info.wb.task);
        }
        spin_unlock_bh(&bdi->wb_lock);
}
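The per-disk registration path mentioned above is a thin wrapper; as far as I recall it is simply:

/* Called from add_disk(); names the bdi after the disk's dev_t,
 * which is where thread names like flush-8:0 come from. */
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
        return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
}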



****************************************************************************************
Besides the bdi that every gendisk registers through bdi_register, the kernel also registers a default one, default_backing_dev_info. This bdi creates a "bdi-default" thread that manages the writeback threads of all the ordinary bdis.
****************************************************************************************
struct backing_dev_info default_backing_dev_info = {
        .name                = "default",
        .ra_pages        = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
        .state                = 0,
        .capabilities        = BDI_CAP_MAP_COPY,
};
static int __init default_bdi_init(void)
{
        int err;

        err = bdi_init(&default_backing_dev_info);
        if (!err)
                bdi_register(&default_backing_dev_info, NULL, "default");
        err = bdi_init(&noop_backing_dev_info);

        return err;
}
subsys_initcall(default_bdi_init);
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...)
{
        va_list args;
        struct device *dev;

        if (bdi->dev)        /* The driver needs to use separate queues per device */
                return 0;

        va_start(args, fmt);
        dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
        va_end(args);
        if (IS_ERR(dev))
                return PTR_ERR(dev);

        bdi->dev = dev;

        /*
         * Just start the forker thread for our default backing_dev_info,
         * and add other bdi's to the list. They will get a thread created
         * on-demand when they need it.
         */
        if (bdi_cap_flush_forker(bdi)) {
                struct bdi_writeback *wb = &bdi->wb;

                wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
                                                dev_name(dev));
                if (IS_ERR(wb->task))
                        return PTR_ERR(wb->task);
        }

        bdi_debug_register(bdi, dev_name(dev));
        set_bit(BDI_registered, &bdi->state);

        spin_lock_bh(&bdi_lock);
        list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
        spin_unlock_bh(&bdi_lock);

        trace_writeback_bdi_register(bdi);
        return 0;
}


****************************************************************************************
The default bdi thread, bdi_forker_thread, is essentially an endless loop, usually woken by the wakeup_timer. It proceeds as follows:
1. Check whether the default bdi's wb has dirty io, or whether there is bdi work queued. If so, delete the wakeup_timer and call wb_do_writeback to write back.
2. Walk every registered bdi on the bdi_list and examine each:
   - If the bdi is not writeback-capable (e.g. a memory-backed bdi), continue to the next one.
   - Otherwise check, just as above, whether the bdi's wb has dirty io or queued bdi work; if so and bdi->wb.task == NULL, set action = FORK_THREAD and break.
   - If the bdi has no work to do and its bdi->wb.task has been idle for longer than bdi_longest_inactive(), set action = KILL_THREAD and break.
3. Now suppose a bdi needs a thread started: kthread_create(bdi_writeback_thread, &bdi->wb, "flush-%s", dev_name(bdi->dev)) creates a writeback thread named something like flush-8:0.
4. Suppose no bdi needs forking or killing: the default bdi thread sleeps (~5 s when it still has dirty io to watch, much longer otherwise); perhaps on waking it will find some bdi that has been inactive long enough to kill.
****************************************************************************************
static int bdi_forker_thread(void *ptr)
{
        struct bdi_writeback *me = ptr;

        current->flags |= PF_SWAPWRITE;
        set_freezable();

        /*
         * Our parent may run at a different priority, just set us to normal
         */
        set_user_nice(current, 0);

        for (;;) {
                struct task_struct *task = NULL;
                struct backing_dev_info *bdi;
                enum {
                        NO_ACTION,   /* Nothing to do */
                        FORK_THREAD, /* Fork bdi thread */
                        KILL_THREAD, /* Kill inactive bdi thread */
                } action = NO_ACTION;

                /*
                 * Temporary measure, we want to make sure we don't see
                 * dirty data on the default backing_dev_info
                 */
                if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
                        del_timer(&me->wakeup_timer);
                        wb_do_writeback(me, 0);
                }

                spin_lock_bh(&bdi_lock);
                /*
                 * In the following loop we are going to check whether we have
                 * some work to do without any synchronization with tasks
                 * waking us up to do work for them. Set the task state here
                 * so that we don't miss wakeups after verifying conditions.
                 */
                set_current_state(TASK_INTERRUPTIBLE);

                list_for_each_entry(bdi, &bdi_list, bdi_list) {
                        bool have_dirty_io;

                        if (!bdi_cap_writeback_dirty(bdi) ||
                             bdi_cap_flush_forker(bdi))
                                continue;

                        WARN(!test_bit(BDI_registered, &bdi->state),
                             "bdi %p/%s is not registered!\n", bdi, bdi->name);

                        have_dirty_io = !list_empty(&bdi->work_list) ||
                                        wb_has_dirty_io(&bdi->wb);

                        /*
                         * If the bdi has work to do, but the thread does not
                         * exist - create it.
                         */
                        if (!bdi->wb.task && have_dirty_io) {
                                /*
                                 * Set the pending bit - if someone will try to
                                 * unregister this bdi - it'll wait on this bit.
                                 */
                                set_bit(BDI_pending, &bdi->state);
                                action = FORK_THREAD;
                                break;
                        }

                        spin_lock(&bdi->wb_lock);

                        /*
                         * If there is no work to do and the bdi thread was
                         * inactive long enough - kill it. The wb_lock is taken
                         * to make sure no-one adds more work to this bdi and
                         * wakes the bdi thread up.
                         */
                        if (bdi->wb.task && !have_dirty_io &&
                            time_after(jiffies, bdi->wb.last_active +
                                                bdi_longest_inactive())) {
                                task = bdi->wb.task;
                                bdi->wb.task = NULL;
                                spin_unlock(&bdi->wb_lock);
                                set_bit(BDI_pending, &bdi->state);
                                action = KILL_THREAD;
                                break;
                        }
                        spin_unlock(&bdi->wb_lock);
                }
                spin_unlock_bh(&bdi_lock);

                /* Keep working if default bdi still has things to do */
                if (!list_empty(&me->bdi->work_list))
                        __set_current_state(TASK_RUNNING);

                switch (action) {
                case FORK_THREAD:
                        __set_current_state(TASK_RUNNING);
                        task = kthread_create(bdi_writeback_thread, &bdi->wb,
                                              "flush-%s", dev_name(bdi->dev));
                        if (IS_ERR(task)) {
                                /*
                                 * If thread creation fails, force writeout of
                                 * the bdi from the thread. Hopefully 1024 is
                                 * large enough for efficient IO.
                                 */
                                writeback_inodes_wb(&bdi->wb, 1024,
                                                    WB_REASON_FORKER_THREAD);
                        } else {
                                /*
                                 * The spinlock makes sure we do not lose
                                 * wake-ups when racing with 'bdi_queue_work()'.
                                 * And as soon as the bdi thread is visible, we
                                 * can start it.
                                 */
                                spin_lock_bh(&bdi->wb_lock);
                                bdi->wb.task = task;
                                spin_unlock_bh(&bdi->wb_lock);
                                wake_up_process(task);
                        }
                        bdi_clear_pending(bdi);
                        break;

                case KILL_THREAD:
                        __set_current_state(TASK_RUNNING);
                        kthread_stop(task);
                        bdi_clear_pending(bdi);
                        break;

                case NO_ACTION:
                        if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
                                /*
                                 * There are no dirty data. The only thing we
                                 * should now care about is checking for
                                 * inactive bdi threads and killing them. Thus,
                                 * let's sleep for longer time, save energy and
                                 * be friendly for battery-driven devices.
                                 */
                                schedule_timeout(bdi_longest_inactive());
                        else
                                schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
                        try_to_freeze();
                        break;
                }
        }

        return 0;
}

****************************************************************************************
Continuing with sys_write: after block_write_end runs, __mark_inode_dirty wakes the default bdi wb or the bdi.wb.task thread if needed. All roads lead to Rome: bdi_writeback_thread eventually gets to run.
bdi_writeback_thread is essentially a loop that calls wb_do_writeback to perform the writeback.
If after a round of writeback there is still work to do, it schedules away with a ~5 s timeout; otherwise it schedules away indefinitely, waiting to be woken by something like sys_write.

Clearly the kernel's disk writeback strategy is:
- Under a sustained stream of writes to a disk, its bdi wb task stays alive and batches up the writeback work once every ~5 s.
- With only occasional writes, a write typically wakes the default bdi thread after the ~5 s timeout; the default bdi thread then creates and runs a flush thread for that disk, which handles the writeback; once the flush thread has been idle long enough, bdi-default kills it, until the next occasional write arrives.
- But wb_do_writeback does not necessarily submit write requests to the disk. Generally, page writes are submitted only when one of two conditions holds: a page has been dirty for more than 30 s, or the proportion of dirty pages is too high.
****************************************************************************************
int bdi_writeback_thread(void *data)
{
        struct bdi_writeback *wb = data;
        struct backing_dev_info *bdi = wb->bdi;
        long pages_written;

        /* setup (flags, freezability, priority) elided in the original post */

        while (!kthread_freezable_should_stop(NULL)) {

                del_timer(&wb->wakeup_timer);

                pages_written = wb_do_writeback(wb, 0);

                trace_writeback_pages_written(pages_written);

                if (pages_written)
                        wb->last_active = jiffies;

                set_current_state(TASK_INTERRUPTIBLE);
                if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
                        __set_current_state(TASK_RUNNING);
                        continue;
                }

                if (wb_has_dirty_io(wb) && dirty_writeback_interval)
                        schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
                else {
                        /*
                         * We have nothing to do, so can go sleep without any
                         * timeout and save power. When a work is queued or
                         * something is made dirty - we will be woken up.
                         */
                        schedule();
                }
        }

        /* Flush any work that raced with us exiting */
        if (!list_empty(&bdi->work_list))
                wb_do_writeback(wb, 1);

        trace_writeback_thread_stop(bdi);
        return 0;
}
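The ~5 s and 30 s figures above are not hard-coded: they come from the vm.dirty_writeback_centisecs (default 500) and vm.dirty_expire_centisecs (default 3000) sysctls. A quick user-space check (minimal sketch):

/* Print the two writeback knobs behind the ~5 s wakeup interval and
 * the 30 s dirty-page expiry discussed above (values in centiseconds). */
#include <stdio.h>

static void show(const char *path)
{
        FILE *f = fopen(path, "r");
        long cs;

        if (f && fscanf(f, "%ld", &cs) == 1)
                printf("%s = %ld (%.1f s)\n", path, cs, cs / 100.0);
        if (f)
                fclose(f);
}

int main(void)
{
        show("/proc/sys/vm/dirty_writeback_centisecs");
        show("/proc/sys/vm/dirty_expire_centisecs");
        return 0;
}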



****************************************************************************************
wb_do_writeback is the function that actually handles the writeback work. Via get_next_work_item it takes every wb_writeback_work item off the bdi->work_list queue and processes each with wb_writeback. In other words, all concrete work is the processing of wb_writeback_work items. But in vfs_write we merely moved the dirty inode onto the bdi->wb.b_dirty list, so where do the wb_writeback_work items come from?
- wb_check_old_data_flush() generates a for_kupdate work item, processed via wb_writeback.
- wb_check_background_flush() generates a for_background work item, processed via wb_writeback.
So the real logic lives in wb_writeback: for_kupdate writes back pages that have been dirty for more than 30 s; for_background checks whether the dirty-page ratio is too high (or free pages too few) and, if so, writes back a portion of the pages.
****************************************************************************************
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
        struct backing_dev_info *bdi = wb->bdi;
        struct wb_writeback_work *work;
        long wrote = 0;

        set_bit(BDI_writeback_running, &wb->bdi->state);
        while ((work = get_next_work_item(bdi)) != NULL) {
                /*
                 * Override sync mode, in case we must wait for completion
                 * because this thread is exiting now.
                 */
                if (force_wait)
                        work->sync_mode = WB_SYNC_ALL;

                trace_writeback_exec(bdi, work);

                wrote += wb_writeback(wb, work);

                /*
                 * Notify the caller of completion if this is a synchronous
                 * work item, otherwise just free it.
                 */
                if (work->done)
                        complete(work->done);
                else
                        kfree(work);
        }

        /*
         * Check for periodic writeback, kupdated() style
         */
        wrote += wb_check_old_data_flush(wb);
        wrote += wb_check_background_flush(wb);
        clear_bit(BDI_writeback_running, &wb->bdi->state);

        return wrote;
}


****************************************************************************************
For a for_background work item, the work is really processed only if over_bground_thresh reports too much dirty data; work->older_than_this is set to the current time.
For a for_kupdate work item, work->older_than_this is set to 30 s before the current time; queue_io makes use of it.
If the work is to proceed, queue_io is called to prepare the b_io dispatch queue: it splices b_more_io onto b_io, then moves any expired inodes from b_dirty onto b_io (only with a for_kupdate item can an inode still be unexpired and held back).
Then __writeback_inodes_wb processes every inode on b_io.
__writeback_inodes_wb's implementation is rather convoluted, but in effect it calls writeback_sb_inodes for each inode, which in turn calls __writeback_single_inode to write back a single inode.
****************************************************************************************
static long wb_writeback(struct bdi_writeback *wb,
                         struct wb_writeback_work *work)
{
        unsigned long wb_start = jiffies;
        long nr_pages = work->nr_pages;
        unsigned long oldest_jif;
        struct inode *inode;
        long progress;

        oldest_jif = jiffies;
        work->older_than_this = &oldest_jif;

        spin_lock(&wb->list_lock);
        for (;;) {
                /*
                 * Stop writeback when nr_pages has been consumed
                 */
                if (work->nr_pages <= 0)
                        break;

                /*
                 * Background writeout and kupdate-style writeback may
                 * run forever. Stop them if there is other work to do
                 * so that e.g. sync can proceed. They'll be restarted
                 * after the other works are all done.
                 */
                if ((work->for_background || work->for_kupdate) &&
                    !list_empty(&wb->bdi->work_list))
                        break;

                /*
                 * For background writeout, stop when we are below the
                 * background dirty threshold
                 */
                if (work->for_background && !over_bground_thresh(wb->bdi))
                        break;

                /*
                 * Kupdate and background works are special and we want to
                 * include all inodes that need writing. Livelock avoidance is
                 * handled by these works yielding to any other work so we are
                 * safe.
                 */
                if (work->for_kupdate) {
                        oldest_jif = jiffies -
                                msecs_to_jiffies(dirty_expire_interval * 10);
                } else if (work->for_background)
                        oldest_jif = jiffies;

                trace_writeback_start(wb->bdi, work);
                if (list_empty(&wb->b_io))
                        queue_io(wb, work);
                if (work->sb)
                        progress = writeback_sb_inodes(work->sb, wb, work);
                else
                        progress = __writeback_inodes_wb(wb, work);
                trace_writeback_written(wb->bdi, work);

                wb_update_bandwidth(wb, wb_start);

                /*
                 * Did we write something? Try for more
                 *
                 * Dirty inodes are moved to b_io for writeback in batches.
                 * The completion of the current batch does not necessarily
                 * mean the overall work is done. So we keep looping as long
                 * as made some progress on cleaning pages or inodes.
                 */
                if (progress)
                        continue;
                /*
                 * No more inodes for IO, bail
                 */
                if (list_empty(&wb->b_more_io))
                        break;
                /*
                 * Nothing written. Wait for some inode to
                 * become available for writeback. Otherwise
                 * we'll just busyloop.
                 */
                if (!list_empty(&wb->b_more_io))  {
                        trace_writeback_wait(wb->bdi, work);
                        inode = wb_inode(wb->b_more_io.prev);
                        spin_lock(&inode->i_lock);
                        spin_unlock(&wb->list_lock);
                        /* This function drops i_lock... */
                        inode_sleep_on_writeback(inode);
                        spin_lock(&wb->list_lock);
                }
        }
        spin_unlock(&wb->list_lock);

        return nr_pages - work->nr_pages;
}


/*
* Queue all expired dirty inodes for io, eldest first.
* Before
*         newly dirtied     b_dirty    b_io    b_more_io
*         =============>    gf         edc     BA
* After
*         newly dirtied     b_dirty    b_io    b_more_io
*         =============>    g          fBAedc
*                                           |
*                                           +--> dequeue for IO
*/
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
        int moved;
        assert_spin_locked(&wb->list_lock);
        list_splice_init(&wb->b_more_io, &wb->b_io);
        moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
        trace_writeback_queue_io(wb, work, moved);
}
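move_expired_inodes is where work->older_than_this actually takes effect; a simplified sketch from memory (the real function also preserves dirtied-time ordering and sorts by superblock):

/* Move inodes dirtied before *work->older_than_this from b_dirty
 * (delaying_queue) to b_io (dispatch_queue), eldest first. */
static int move_expired_inodes(struct list_head *delaying_queue,
                               struct list_head *dispatch_queue,
                               struct wb_writeback_work *work)
{
        int moved = 0;

        while (!list_empty(delaying_queue)) {
                struct inode *inode = wb_inode(delaying_queue->prev);

                if (work->older_than_this &&
                    inode_dirtied_after(inode, *work->older_than_this))
                        break;
                list_move(&inode->i_wb_list, dispatch_queue);
                moved++;
        }
        return moved;
}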

#2 | Posted 2012-10-09 18:27 (last edited by blake326 on 2012-10-09 18:52)

****************************************************************************************
In block_write_end, flush_dcache_page first performs the architecture-specific data-cache flush (generally needed on RISC; rather involved, and I admit I don't fully understand it).
Then __block_commit_write walks every buffer_head of the page: each buffer_head touched by this write gets mark_buffer_dirty to set its dirty bit. And if all buffer_heads are uptodate (the just-written ones certainly are; the question is whether the untouched ones are too), the page's uptodate bit is set.
Finally the page is unlocked and the number of bytes written is returned.
****************************************************************************************
****************************************************************************************
static int ext2_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        int ret;

        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
        if (ret < len)
                ext2_write_failed(mapping, pos + len);
        return ret;
}
int generic_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct inode *inode = mapping->host;
        int i_size_changed = 0;

        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

        /*
         * No need to use i_size_read() here, the i_size
         * cannot change under us because we hold i_mutex.
         *
         * But it's important to update i_size while still holding page lock:
         * page writeout could otherwise come in and zero beyond i_size.
         */
        if (pos+copied > inode->i_size) {
                i_size_write(inode, pos+copied);
                i_size_changed = 1;
        }

        unlock_page(page);
        page_cache_release(page);

        /*
         * Don't mark the inode dirty under page lock. First, it unnecessarily
         * makes the holding time of page lock longer. Second, it forces lock
         * ordering of page lock and transaction start for journaling
         * filesystems.
         */
        if (i_size_changed)
                mark_inode_dirty(inode);

        return copied;
}

int block_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct inode *inode = mapping->host;
        unsigned start;

        start = pos & (PAGE_CACHE_SIZE - 1);

        if (unlikely(copied < len)) {
                /*
                 * The buffers that were written will now be uptodate, so we
                 * don't have to worry about a readpage reading them and
                 * overwriting a partial write. However if we have encountered
                 * a short write and only partially written into a buffer, it
                 * will not be marked uptodate, so a readpage might come in and
                 * destroy our partial write.
                 *
                 * Do the simplest thing, and just treat any short write to a
                 * non uptodate page as a zero-length write, and force the
                 * caller to redo the whole thing.
                 */
                if (!PageUptodate(page))
                        copied = 0;

                page_zero_new_buffers(page, start+copied, start+len);
        }
        flush_dcache_page(page);

        /* This could be a short (even 0-length) commit */
        __block_commit_write(inode, page, start, start+copied);

        return copied;
}
static int __block_commit_write(struct inode *inode, struct page *page,
                unsigned from, unsigned to)
{
        unsigned block_start, block_end;
        int partial = 0;
        unsigned blocksize;
        struct buffer_head *bh, *head;

        blocksize = 1 << inode->i_blkbits;

        for(bh = head = page_buffers(page), block_start = 0;
            bh != head || !block_start;
            block_start=block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                if (block_end <= from || block_start >= to) {
                        if (!buffer_uptodate(bh))
                                partial = 1;
                } else {
                        set_buffer_uptodate(bh);
                        mark_buffer_dirty(bh);
                }
                clear_buffer_new(bh);
        }

        /*
         * If this is a partial write which happened to make all buffers
         * uptodate then we can optimize away a bogus readpage() for
         * the next read(). Here we 'discover' whether the page went
         * uptodate as a result of this (potentially partial) write.
         */
        if (!partial)
                SetPageUptodate(page);
        return 0;
}

****************************************************************************************
test_set_buffer_dirty sets the dirty bit and returns its old value. So normally, the first write to a page reaches __set_page_dirty, which sets the page's dirty bit. Of course __set_page_dirty does more than set one bit: it also calls __mark_inode_dirty, which may wake the writeback thread.
__mark_inode_dirty checks whether the inode is already dirty; if not, it marks it dirty, checks whether the bdi associated with the inode already has dirty inodes queued (bdi->wb.b_dirty), moves the inode onto the bdi dirty list, and then decides whether to schedule a delayed wakeup (5 s by default) of the bdi writeback thread to do the actual disk writes.
****************************************************************************************
void mark_buffer_dirty(struct buffer_head *bh)
{
        WARN_ON_ONCE(!buffer_uptodate(bh));

        /*
         * Very *carefully* optimize the it-is-already-dirty case.
         *
         * Don't let the final "is it dirty" escape to before we
         * perhaps modified the buffer.
         */
        if (buffer_dirty(bh)) {
                smp_mb();
                if (buffer_dirty(bh))
                        return;
        }

        if (!test_set_buffer_dirty(bh)) {
                struct page *page = bh->b_page;
                if (!TestSetPageDirty(page)) {
                        struct address_space *mapping = page_mapping(page);
                        if (mapping)
                                __set_page_dirty(page, mapping, 0);
                }
        }
}
static void __set_page_dirty(struct page *page,
                struct address_space *mapping, int warn)
{
        spin_lock_irq(&mapping->tree_lock);
        if (page->mapping) {        /* Race with truncate? */
                WARN_ON_ONCE(warn && !PageUptodate(page));
                account_page_dirtied(page, mapping);
                radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
        }
        spin_unlock_irq(&mapping->tree_lock);
        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
void __mark_inode_dirty(struct inode *inode, int flags)
{
        struct super_block *sb = inode->i_sb;
        struct backing_dev_info *bdi = NULL;

        /*
         * Don't do this for I_DIRTY_PAGES - that doesn't actually
         * dirty the inode itself
         */
        if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                if (sb->s_op->dirty_inode)
                        sb->s_op->dirty_inode(inode, flags);
        }

        /*
         * make sure that changes are seen by all cpus before we test i_state
         * -- mikulas
         */
        smp_mb();

        /* avoid the locking if we can */
        if ((inode->i_state & flags) == flags)
                return;

        if (unlikely(block_dump))
                block_dump___mark_inode_dirty(inode);

        spin_lock(&inode->i_lock);
        if ((inode->i_state & flags) != flags) {
                const int was_dirty = inode->i_state & I_DIRTY;

                inode->i_state |= flags;

                /*
                 * If the inode is being synced, just update its dirty state.
                 * The unlocker will place the inode on the appropriate
                 * superblock list, based upon its state.
                 */
                if (inode->i_state & I_SYNC)
                        goto out_unlock_inode;

                /*
                 * Only add valid (hashed) inodes to the superblock's
                 * dirty list.  Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
                        if (inode_unhashed(inode))
                                goto out_unlock_inode;
                }
                if (inode->i_state & I_FREEING)
                        goto out_unlock_inode;

                /*
                 * If the inode was already on b_dirty/b_io/b_more_io, don't
                 * reposition it (that would break b_dirty time-ordering).
                 */
                if (!was_dirty) {
                        bool wakeup_bdi = false;
                        bdi = inode_to_bdi(inode);

                        if (bdi_cap_writeback_dirty(bdi)) {
                                WARN(!test_bit(BDI_registered, &bdi->state),
                                     "bdi-%s not registered\n", bdi->name);

                                /*
                                 * If this is the first dirty inode for this
                                 * bdi, we have to wake-up the corresponding
                                 * bdi thread to make sure background
                                 * write-back happens later.
                                 */
                                if (!wb_has_dirty_io(&bdi->wb))
                                        wakeup_bdi = true;
                        }

                        spin_unlock(&inode->i_lock);
                        spin_lock(&bdi->wb.list_lock);
                        inode->dirtied_when = jiffies;
                        list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
                        spin_unlock(&bdi->wb.list_lock);

                        if (wakeup_bdi)
                                bdi_wakeup_thread_delayed(bdi);
                        return;
                }
        }
out_unlock_inode:
        spin_unlock(&inode->i_lock);

}
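The delayed wakeup on the last line above is bdi_wakeup_thread_delayed(); as far as I recall it merely arms the wakeup_timer shown in post #3:

/* Arm the bdi's wakeup_timer to fire dirty_writeback_interval (~5 s)
 * from now; wakeup_timer_fn then wakes the bdi thread, or the forker
 * thread if the bdi thread does not currently exist. */
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
        unsigned long timeout;

        timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
        mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
}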
  
