- 论坛徽章:
- 0
|
****************************************************************************************
sys_write仅仅是写page,并没有提交写请求,具体的写工作是一个叫做bdi writeback线程来负责的。在代码里面是由每个磁盘相关的request_queue.backing_dev_info来管理的。backing_dev_info维护了一些磁盘辅助信息,最重要的一个属性就是bdi_writeback结构:
bdi_writeback的task指向了writeback线程,如果存在的话。
wakeup_timer定时器,默认的回调方法就是唤醒default_backing_dev_info.wb.task或者当前bdi.wb.task
b_dirty链表保存了该磁盘上被置脏(dirty)的inode。
在add_disk中,通过bdi_register_dev(&disk->queue->backing_dev_info, disk_devt(disk))注册bdi。就是说每个gendisk就有一个bdi。
****************************************************************************************
/*
 * Auxiliary per-device state used for readahead, dirty throttling and
 * writeback.  The embedded bdi_writeback (@wb) holds the writeback thread
 * pointer and the dirty-inode lists belonging to this device.
 */
struct backing_dev_info {
	struct list_head bdi_list;	/* entry on the global bdi_list */
	unsigned long ra_pages;		/* readahead ceiling, in PAGE_CACHE_SIZE units */
	unsigned long state;		/* flag bits; touch with atomic bitops only */
	unsigned int capabilities;	/* capability flags of the device */
	congested_fn *congested_fn;	/* congestion callback, used by md/dm devices */
	void *congested_data;		/* auxiliary data passed to congested_fn */

	char *name;

	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];

	unsigned long bw_time_stamp;	/* when the write bandwidth was last updated */
	unsigned long dirtied_stamp;
	unsigned long written_stamp;	/* pages written as of bw_time_stamp */
	unsigned long write_bandwidth;	/* estimated write bandwidth */
	unsigned long avg_write_bandwidth; /* write bandwidth after further smoothing */

	/*
	 * Base dirty throttle rate, recomputed every 200ms; the dirty rate of
	 * every task on this bdi is held below it.  @dirty_ratelimit follows
	 * the estimated @balanced_dirty_ratelimit in small steps, which makes
	 * it much smoother and more stable than the latter.
	 */
	unsigned long dirty_ratelimit;
	unsigned long balanced_dirty_ratelimit;

	struct fprop_local_percpu completions;
	int dirty_exceeded;

	unsigned int min_ratio;
	unsigned int max_ratio;
	unsigned int max_prop_frac;

	struct bdi_writeback wb;	/* default writeback state of this bdi */
	spinlock_t wb_lock;		/* guards work_list */

	struct list_head work_list;	/* queued writeback work items */

	struct device *dev;		/* set by bdi_register() once the device exists */

	struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS
	struct dentry *debug_dir;
	struct dentry *debug_stats;
#endif
};
/*
 * Writeback state of one bdi: the thread doing the work, the timer used to
 * wake it up later, and the inode lists it works through.
 */
struct bdi_writeback {
	struct backing_dev_info *bdi;	/* the bdi this structure belongs to */
	unsigned int nr;

	unsigned long last_old_flush;	/* time of the last old-data flush */
	unsigned long last_active;	/* time the bdi thread was last active */

	struct task_struct *task;	/* writeback thread, NULL when none exists */
	struct timer_list wakeup_timer;	/* delayed wakeup of the bdi thread */

	struct list_head b_dirty;	/* inodes that are dirty */
	struct list_head b_io;		/* inodes parked for writeback */
	struct list_head b_more_io;	/* inodes parked for more writeback */
	spinlock_t list_lock;		/* guards the three b_* lists */
};
/*
 * Timer callback for a bdi's wakeup timer.
 *
 * @data: the owning struct backing_dev_info, passed as an unsigned long.
 *
 * Under wb_lock, wakes the bdi's own writeback thread when it has one.
 * Otherwise, if the bdi has a device, wakes the thread of
 * default_backing_dev_info instead.
 */
static void wakeup_timer_fn(unsigned long data)
{
	struct backing_dev_info *owner = (struct backing_dev_info *)data;

	spin_lock_bh(&owner->wb_lock);

	if (owner->wb.task) {
		/* The bdi thread is alive: wake it directly. */
		trace_writeback_wake_thread(owner);
		wake_up_process(owner->wb.task);
		goto unlock;
	}

	if (owner->dev) {
		/*
		 * A bdi thread that stays inactive for a long time gets
		 * killed.  When that has happened, wake the forker thread so
		 * that it creates and runs the bdi thread again.
		 */
		trace_writeback_wake_forker_thread(owner);
		wake_up_process(default_backing_dev_info.wb.task);
	}

unlock:
	spin_unlock_bh(&owner->wb_lock);
}
****************************************************************************************
除了每个gendisk都会通过bdi_register注册一个bdi之外,内核还注册了一个默认的default_backing_dev_info。这个bdi创建了一个名为bdi-default的线程,用来管理其他普通bdi的writeback线程。
****************************************************************************************
/*
 * The built-in "default" bdi.  Members not named here are zero-initialised.
 */
struct backing_dev_info default_backing_dev_info = {
	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
	.state		= 0,
	.capabilities	= BDI_CAP_MAP_COPY,
	.name		= "default",
};
/*
 * Boot-time setup of the two built-in bdis (run as a subsys initcall).
 *
 * Initialises default_backing_dev_info and, when that succeeds, registers it
 * under the name "default"; then initialises noop_backing_dev_info.
 *
 * Returns the result of initialising noop_backing_dev_info.  Note that a
 * failure of the first bdi_init() only suppresses the registration (its
 * error code is then overwritten), and that the return value of
 * bdi_register() is not checked.
 */
static int __init default_bdi_init(void)
{
	int err;

	err = bdi_init(&default_backing_dev_info);
	if (!err)
		bdi_register(&default_backing_dev_info, NULL, "default");

	err = bdi_init(&noop_backing_dev_info);

	return err;
}
subsys_initcall(default_bdi_init);
/*
 * Register a bdi: create its device, start the forker thread if this bdi is
 * the one that hosts it, and publish the bdi on the global bdi_list.
 *
 * @bdi:    the bdi to register
 * @parent: parent device handed to device_create_vargs()
 * @fmt:    printf-style format (plus arguments) for the device name
 *
 * Returns 0 on success, and also 0 without doing anything when the bdi
 * already has a device.  Returns the PTR_ERR() value when device creation
 * or thread creation fails.
 */
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
		 const char *fmt, ...)
{
	struct device *bdi_dev;
	va_list ap;

	/* The driver needs to use separate queues per device */
	if (bdi->dev)
		return 0;

	va_start(ap, fmt);
	bdi_dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, ap);
	va_end(ap);
	if (IS_ERR(bdi_dev))
		return PTR_ERR(bdi_dev);

	bdi->dev = bdi_dev;

	/*
	 * Only the default backing_dev_info gets its (forker) thread started
	 * right here.  Every other bdi is merely added to the list and has a
	 * thread created on demand, once it needs one.
	 */
	if (bdi_cap_flush_forker(bdi)) {
		bdi->wb.task = kthread_run(bdi_forker_thread, &bdi->wb, "bdi-%s",
					   dev_name(bdi_dev));
		/*
		 * NOTE(review): on failure wb.task keeps the ERR_PTR value and
		 * bdi->dev stays set; confirm that no later user treats
		 * wb.task as a valid task pointer in that case.
		 */
		if (IS_ERR(bdi->wb.task))
			return PTR_ERR(bdi->wb.task);
	}

	bdi_debug_register(bdi, dev_name(bdi_dev));
	set_bit(BDI_registered, &bdi->state);

	/* Make the bdi visible to walkers of the global list. */
	spin_lock_bh(&bdi_lock);
	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}
****************************************************************************************
default bdi线程bdi_forker_thread实际上是一个死循环,一般是由wakeup_timer定时器唤醒。具体处理如下:
1. 检查default bdi的wb是否有io请求,或者是否有bdi work。有的话则删除wakeup_timer,调用wb_do_writeback进行回写。
2. 通过bdi_list链表遍历所有注册的bdi,对每个bdi进行检查。
如果bdi没有写回的特点比如内存bdi则继续下一个bdi。
然后同样检查bdi的wb是否有请求或者有bdi work,如果有的话,并且bdi->wb.task=NULL,则设置action=FORK_THREAD,然后break。
如果bdi没有工作要处理,并且该bdi->wb.task已经空闲了5秒钟以上,则设置action=KILL_THREAD,然后break。
3. 现在假设发现了要启动一个bdi,kthread_create(bdi_writeback_thread, &bdi->wb,"flush-%s", dev_name(bdi->dev))创建一个flush-8:0之类的writeback线程。
4. 假设没有fork,也没有kill的bdi。则default bdi线程睡眠5s钟,或许醒来之后会发现哪个bdi长期不活动可以kill掉了。
****************************************************************************************
static int bdi_forker_thread(void *ptr)
{
struct bdi_writeback *me = ptr;
current->flags |= PF_SWAPWRITE;
set_freezable();
/*
* Our parent may run at a different priority, just set us to normal
*/
set_user_nice(current, 0);
for (; {
struct task_struct *task = NULL;
struct backing_dev_info *bdi;
enum {
NO_ACTION, /* Nothing to do */
FORK_THREAD, /* Fork bdi thread */
KILL_THREAD, /* Kill inactive bdi thread */
} action = NO_ACTION;
/*
* Temporary measure, we want to make sure we don't see
* dirty data on the default backing_dev_info
*/
if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
del_timer(&me->wakeup_timer);
wb_do_writeback(me, 0);
}
spin_lock_bh(&bdi_lock);
/*
* In the following loop we are going to check whether we have
* some work to do without any synchronization with tasks
* waking us up to do work for them. Set the task state here
* so that we don't miss wakeups after verifying conditions.
*/
set_current_state(TASK_INTERRUPTIBLE);
list_for_each_entry(bdi, &bdi_list, bdi_list) {
bool have_dirty_io;
if (!bdi_cap_writeback_dirty(bdi) ||
bdi_cap_flush_forker(bdi))
continue;
WARN(!test_bit(BDI_registered, &bdi->state),
"bdi %p/%s is not registered!\n", bdi, bdi->name);
have_dirty_io = !list_empty(&bdi->work_list) ||
wb_has_dirty_io(&bdi->wb);
/*
* If the bdi has work to do, but the thread does not
* exist - create it.
*/
if (!bdi->wb.task && have_dirty_io) {
/*
* Set the pending bit - if someone will try to
* unregister this bdi - it'll wait on this bit.
*/
set_bit(BDI_pending, &bdi->state);
action = FORK_THREAD;
break;
}
spin_lock(&bdi->wb_lock);
/*
* If there is no work to do and the bdi thread was
* inactive long enough - kill it. The wb_lock is taken
* to make sure no-one adds more work to this bdi and
* wakes the bdi thread up.
*/
if (bdi->wb.task && !have_dirty_io &&
time_after(jiffies, bdi->wb.last_active +
bdi_longest_inactive())) {
task = bdi->wb.task;
bdi->wb.task = NULL;
spin_unlock(&bdi->wb_lock);
set_bit(BDI_pending, &bdi->state);
action = KILL_THREAD;
break;
}
spin_unlock(&bdi->wb_lock);
}
spin_unlock_bh(&bdi_lock);
/* Keep working if default bdi still has things to do */
if (!list_empty(&me->bdi->work_list))
__set_current_state(TASK_RUNNING);
switch (action) {
case FORK_THREAD:
__set_current_state(TASK_RUNNING);
task = kthread_create(bdi_writeback_thread, &bdi->wb,
"flush-%s", dev_name(bdi->dev));
if (IS_ERR(task)) {
/*
* If thread creation fails, force writeout of
* the bdi from the thread. Hopefully 1024 is
* large enough for efficient IO.
*/
writeback_inodes_wb(&bdi->wb, 1024,
WB_REASON_FORKER_THREAD);
} else {
/*
* The spinlock makes sure we do not lose
* wake-ups when racing with 'bdi_queue_work()'.
* And as soon as the bdi thread is visible, we
* can start it.
*/
spin_lock_bh(&bdi->wb_lock);
bdi->wb.task = task;
spin_unlock_bh(&bdi->wb_lock);
wake_up_process(task);
}
bdi_clear_pending(bdi);
break;
case KILL_THREAD:
__set_current_state(TASK_RUNNING);
kthread_stop(task);
bdi_clear_pending(bdi);
break;
case NO_ACTION:
if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
/*
* There are no dirty data. The only thing we
* should now care about is checking for
* inactive bdi threads and killing them. Thus,
* let's sleep for longer time, save energy and
* be friendly for battery-driven devices.
*/
schedule_timeout(bdi_longest_inactive());
else
schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
try_to_freeze();
break;
}
}
return 0;
}
****************************************************************************************
继续sys_write,执行block_write_end之后通过__mark_inode_dirty有需要的话唤醒default bdi wb或者bdi.wb.task线程。条条大路通罗马,bdi_writeback_thread最终会被执行。
bdi_writeback_thread本质上就是一个循环,调用wb_do_writeback进行回写。
回写之后发现仍然有工作要做,超时5s调度出去。否则直接调度出去,等待被sys_write之类唤醒。
很明显内核磁盘回写的策略是:
如果持续的对磁盘有写请求的话,bdi wb task将会一直存在,但是每隔5s集中处理一次回写工作。
如果只是偶尔的对磁盘有写请求的话,一般一个写请求之后,都会5s超时唤醒default bdi线程,然后由default bdi线程创建并运行一个磁盘的flush线程,在这个flush线程中处理回写,过了5s左右之后这个bdi wb task会被bdi-default kill掉。然后偶然又有写请求产生时,重复上述过程。
但是wb_do_writeback回写并不是一定会提交写请求给磁盘,一般只有在满足以下两种情况之一时才会提交一部分page写请求给磁盘:一个是page dirty超时30s,另外一个是dirty page比例太高。
****************************************************************************************
int bdi_writeback_thread(void *data)
{
while (!kthread_freezable_should_stop(NULL)) {
del_timer(&wb->wakeup_timer);
pages_written = wb_do_writeback(wb, 0);
trace_writeback_pages_written(pages_written);
if (pages_written)
wb->last_active = jiffies;
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
continue;
}
if (wb_has_dirty_io(wb) && dirty_writeback_interval)
schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
else {
/*
* We have nothing to do, so can go sleep without any
* timeout and save power. When a work is queued or
* something is made dirty - we will be woken up.
*/
schedule();
}
}
/* Flush any work that raced with us exiting */
if (!list_empty(&bdi->work_list))
wb_do_writeback(wb, 1);
trace_writeback_thread_stop(bdi);
return 0;
}
****************************************************************************************
wb_do_writeback是具体处理写回工作的函数。通过get_next_work_item,从bdi->work_list任务队列上取出所有wb_writeback_work类型的任务,并且分别调用wb_writeback进行处理。就是说,具体的工作处理都是对wb_writeback_work的处理。但是,vfs_write中我们只不过将dirty的inode move到了bdi->wb.b_dirty队列。那么wb_writeback_work任务怎么来的呢?
wb_check_old_data_flush() 产生一个for_kupdate的任务通过wb_writeback处理。
wb_check_background_flush()产生一个for_background的任务通过wb_writeback处理。
所以具体逻辑都是在wb_writeback里面处理的,for_kupdate会写回dirty超过30s的page。for_background会检测是否脏页比例过高或者可用页太少,然后写回一部分page。
****************************************************************************************
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
long wrote = 0;
set_bit(BDI_writeback_running, &wb->bdi->state);
while ((work = get_next_work_item(bdi)) != NULL) {
/*
* Override sync mode, in case we must wait for completion
* because this thread is exiting now.
*/
if (force_wait)
work->sync_mode = WB_SYNC_ALL;
trace_writeback_exec(bdi, work);
wrote += wb_writeback(wb, work);
/*
* Notify the caller of completion if this is a synchronous
* work item, otherwise just free it.
*/
if (work->done)
complete(work->done);
else
kfree(work);
}
/*
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
wrote += wb_check_background_flush(wb);
clear_bit(BDI_writeback_running, &wb->bdi->state);
return wrote;
}
****************************************************************************************
对于for_background任务,通过over_bground_thresh检查dirty过多的话才真正处理work。work->older_than_this为当前时间。
对于for_kupdate任务,设置work->older_than_this为当前时间往前30s。queue_io会用到。
如果需要继续处理任务,调用queue_io准备b_io处理队列。queue_io会将b_more_io拼接到b_io上,并且检查是否有过期的inode需要move到b_io上(只有for_kupdate的任务才有可能)。
然后调用__writeback_inodes_wb处理b_io上所有的inode。
__writeback_inodes_wb的实现比较绕人,实际上就是对每个inode分别调用了writeback_sb_inodes,从而调用了__writeback_single_inode写回单个inode。
****************************************************************************************
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
unsigned long oldest_jif;
struct inode *inode;
long progress;
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
spin_lock(&wb->list_lock);
for (; {
/*
* Stop writeback when nr_pages has been consumed
*/
if (work->nr_pages <= 0)
break;
/*
* Background writeout and kupdate-style writeback may
* run forever. Stop them if there is other work to do
* so that e.g. sync can proceed. They'll be restarted
* after the other works are all done.
*/
if ((work->for_background || work->for_kupdate) &&
!list_empty(&wb->bdi->work_list))
break;
/*
* For background writeout, stop when we are below the
* background dirty threshold
*/
if (work->for_background && !over_bground_thresh(wb->bdi))
break;
/*
* Kupdate and background works are special and we want to
* include all inodes that need writing. Livelock avoidance is
* handled by these works yielding to any other work so we are
* safe.
*/
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background)
oldest_jif = jiffies;
trace_writeback_start(wb->bdi, work);
if (list_empty(&wb->b_io))
queue_io(wb, work);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb->bdi, work);
wb_update_bandwidth(wb, wb_start);
/*
* Did we write something? Try for more
*
* Dirty inodes are moved to b_io for writeback in batches.
* The completion of the current batch does not necessarily
* mean the overall work is done. So we keep looping as long
* as made some progress on cleaning pages or inodes.
*/
if (progress)
continue;
/*
* No more inodes for IO, bail
*/
if (list_empty(&wb->b_more_io))
break;
/*
* Nothing written. Wait for some inode to
* become available for writeback. Otherwise
* we'll just busyloop.
*/
if (!list_empty(&wb->b_more_io)) {
trace_writeback_wait(wb->bdi, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
/* This function drops i_lock... */
inode_sleep_on_writeback(inode);
spin_lock(&wb->list_lock);
}
}
spin_unlock(&wb->list_lock);
return nr_pages - work->nr_pages;
}
/*
 * Queue all expired dirty inodes for io, eldest first.
 *
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 *
 * The caller must hold wb->list_lock.
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
	int nr_moved;

	assert_spin_locked(&wb->list_lock);

	/* Everything parked on b_more_io is spliced onto b_io first ... */
	list_splice_init(&wb->b_more_io, &wb->b_io);

	/* ... then move_expired_inodes() moves inodes from b_dirty to b_io. */
	nr_moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);

	trace_writeback_queue_io(wb, work, nr_moved);
}
|
|