/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
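
/*
 * For example, with 4KB pages PAGE_CACHE_SHIFT is 12, so the define
 * works out to 4096UL >> 2 == 1024 pages == 4MB.  With 64KB pages it
 * would be 4096UL >> 6 == 64 pages, still 4MB worth of data.
 */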

struct wb_completion {
	atomic_t		cnt;
};

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	unsigned int single_wait:1;
	unsigned int single_done:1;
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
 */
#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)	\
	struct wb_completion cmpl = {		\
		.cnt		= ATOMIC_INIT(1),	\
	}
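
/*
 * A minimal sketch of the pattern described above (illustrative only,
 * not a caller that exists in this file):
 *
 *	DEFINE_WB_COMPLETION_ONSTACK(done);
 *
 *	work->done = &done;
 *	wb_queue_work(wb, work);		(repeat per work item)
 *	wb_wait_for_completion(bdi, &done);
 *
 * The on-stack completion starts at one; wb_queue_work() bumps the count
 * for each issued item and wb_wait_for_completion() puts down the initial
 * count before sleeping until all issued items have finished.
 */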

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_wb_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}

static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}

/**
 * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_wb_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_wb_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);

	list_move(&inode->i_wb_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}

/**
 * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
 * @inode: inode to be removed
 * @wb: bdi_writeback @inode is being removed from
 *
 * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
 * clear %WB_has_dirty_io if all are empty afterwards.
 */
static void inode_wb_list_del_locked(struct inode *inode,
				     struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);

	list_del_init(&inode->i_wb_list);
	wb_io_lists_depopulated(wb);
}

static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}

static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb->bdi, work);

	spin_lock_bh(&wb->work_lock);
	if (!test_bit(WB_registered, &wb->state)) {
		if (work->single_wait)
			work->single_done = 1;
		goto out_unlock;
	}
	if (work->done)
		atomic_inc(&work->done->cnt);
	list_add_tail(&work->list, &wb->work_list);
	mod_delayed_work(bdi_wq, &wb->dwork, 0);
out_unlock:
	spin_unlock_bh(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @bdi: bdi work items were issued to
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been defined with
 * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
 * work items are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
static void wb_wait_for_completion(struct backing_dev_info *bdi,
				   struct wb_completion *done)
{
	atomic_dec(&done->cnt);		/* put down the initial count */
	wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
}

#ifdef CONFIG_CGROUP_WRITEBACK

/* parameters for foreign inode detection, see wbc_detach_inode() */
#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, up to 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */

#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect up to 5 slots */
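
/*
 * Concretely, with the values above: time is kept in a fixed-point scale
 * where 1s == 2^13, so a 16bit field covers up to 8 seconds.  The history
 * window is WB_FRN_TIME_PERIOD == 2s split into 16 slots of 1/8s each.
 * The switch verdict requires more than WB_FRN_HIST_THR_SLOTS == 8
 * foreign slots, i.e. over ~1s of foreign IO within the 2s window, and a
 * single round can shift in at most 5 slots.
 */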

void __inode_attach_wb(struct inode *inode, struct page *page)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;

	if (inode_cgwb_enabled(inode)) {
		struct cgroup_subsys_state *memcg_css;

		if (page) {
			memcg_css = mem_cgroup_css_from_page(page);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
		} else {
			/* must pin memcg_css, see wb_get_create() */
			memcg_css = task_get_css(current, memory_cgrp_id);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
			css_put(memcg_css);
		}
	}

	if (!wb)
		wb = &bdi->wb;

	/*
	 * There may be multiple instances of this function racing to
	 * update the same inode.  Use cmpxchg() to tell the winner.
	 */
	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
		wb_put(wb);
}

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
	__acquires(&wb->list_lock)
{
	while (true) {
		struct bdi_writeback *wb = inode_to_wb(inode);

		/*
		 * inode_to_wb() association is protected by both
		 * @inode->i_lock and @wb->list_lock but list_lock nests
		 * outside i_lock.  Drop i_lock and verify that the
		 * association hasn't changed after acquiring list_lock.
		 */
		wb_get(wb);
		spin_unlock(&inode->i_lock);
		spin_lock(&wb->list_lock);
		wb_put(wb);		/* not gonna deref it anymore */

		if (likely(wb == inode_to_wb(inode)))
			return wb;	/* @inode already has ref */

		spin_unlock(&wb->list_lock);
		cpu_relax();
		spin_lock(&inode->i_lock);
	}
}

/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	spin_lock(&inode->i_lock);
	return locked_inode_to_wb_and_lock_list(inode);
}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
				 struct inode *inode)
{
	wbc->wb = inode_to_wb(inode);
	wbc->inode = inode;

	wbc->wb_id = wbc->wb->memcg_css->id;
	wbc->wb_lcand_id = inode->i_wb_frn_winner;
	wbc->wb_tcand_id = 0;
	wbc->wb_bytes = 0;
	wbc->wb_lcand_bytes = 0;
	wbc->wb_tcand_bytes = 0;

	wb_get(wbc->wb);
	spin_unlock(&inode->i_lock);
}
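
/*
 * A rough sketch of the intended calling sequence (the real caller in
 * this file is writeback_single_inode() below):
 *
 *	spin_lock(&inode->i_lock);
 *	wbc_attach_and_unlock_inode(wbc, inode);
 *	... write out pages, calling wbc_account_io() as bytes go out ...
 *	wbc_detach_inode(wbc);
 *
 * Attach records the current wb and candidate IDs, accounting feeds the
 * majority vote, and detach renders the verdict described below.
 */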

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (single foreign page can lead to gigabytes of writeback to be
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb (max of last round's current
 * wb, the winner from two rounds ago, and the last round's majority
 * candidate).  Keeping track of the historical winner helps the algorithm
 * to semi-reliably detect the most active writer even when it's not the
 * absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * over a certain threshold, the switch verdict is given.
 */
void wbc_detach_inode(struct writeback_control *wbc)
{
	struct bdi_writeback *wb = wbc->wb;
	struct inode *inode = wbc->inode;
	u16 history = inode->i_wb_frn_history;
	unsigned long avg_time = inode->i_wb_frn_avg_time;
	unsigned long max_bytes, max_time;
	int max_id;

	/* pick the winner of this round */
	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_id;
		max_bytes = wbc->wb_bytes;
	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_lcand_id;
		max_bytes = wbc->wb_lcand_bytes;
	} else {
		max_id = wbc->wb_tcand_id;
		max_bytes = wbc->wb_tcand_bytes;
	}

	/*
	 * Calculate the amount of IO time the winner consumed and fold it
	 * into the running average kept per inode.  If the consumed IO
	 * time is lower than avg_time / WB_FRN_TIME_CUT_DIV, ignore it for
	 * deciding whether to switch or not.  This is to prevent one-off
	 * small dirtiers from skewing the verdict.
	 */
	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
				wb->avg_write_bandwidth);
	if (avg_time)
		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
	else
		avg_time = max_time;	/* immediate catch up on first run */

	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
		int slots;

		/*
		 * The switch verdict is reached if foreign wb's consume
		 * more than a certain proportion of IO time in a
		 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
		 * history mask where each bit represents one sixteenth of
		 * the period.  Determine the number of slots to shift into
		 * history from @max_time.
		 */
		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
		history <<= slots;
		if (wbc->wb_id != max_id)
			history |= (1U << slots) - 1;

		/*
		 * Switch if the current wb isn't the consistent winner.
		 * If there are multiple closely competing dirtiers, the
		 * inode may switch across them repeatedly over time, which
		 * is okay.  The main goal is avoiding keeping an inode on
		 * the wrong wb for an extended period of time.
		 */
		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) {
			/* switch */
			max_id = 0;
			avg_time = 0;
			history = 0;
		}
	}

	/*
	 * Multiple instances of this function may race to update the
	 * following fields but we don't mind occasional inaccuracies.
	 */
	inode->i_wb_frn_winner = max_id;
	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
	inode->i_wb_frn_history = history;

	wb_put(wbc->wb);
	wbc->wb = NULL;
}
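
/*
 * A worked example of the history arithmetic above, with made-up numbers:
 * suppose a round's winner is foreign and its max_time covers three slots
 * (3/8s of the 2s window).  history is shifted left by 3 and the low 3
 * bits are set.  After a few such rounds hweight32(history) exceeds
 * WB_FRN_HIST_THR_SLOTS (8) and the switch verdict is given, clearing the
 * fields as done above.
 */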

/**
 * wbc_account_io - account IO issued during writeback
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book for foreign inode detection.  See
 * wbc_detach_inode().
 */
void wbc_account_io(struct writeback_control *wbc, struct page *page,
		    size_t bytes)
{
	int id;

	/*
	 * pageout() path doesn't attach @wbc to the inode being written
	 * out.  This is intentional as we don't want the function to block
	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
	 * regular writeback instead of writing things out itself.
	 */
	if (!wbc->wb)
		return;

	rcu_read_lock();
	id = mem_cgroup_css_from_page(page)->id;
	rcu_read_unlock();

	if (id == wbc->wb_id) {
		wbc->wb_bytes += bytes;
		return;
	}

	if (id == wbc->wb_lcand_id)
		wbc->wb_lcand_bytes += bytes;

	/* Boyer-Moore majority vote algorithm */
	if (!wbc->wb_tcand_bytes)
		wbc->wb_tcand_id = id;
	if (id == wbc->wb_tcand_id)
		wbc->wb_tcand_bytes += bytes;
	else
		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
}
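
/*
 * To illustrate the Boyer-Moore step with a hypothetical page sequence
 * (one page == one unit of bytes) from foreign cgroups A, B, A, A:
 *
 *	A: tcand_bytes == 0, so tcand_id = A, then tcand_bytes = 1
 *	B: tcand_bytes drops back to 0
 *	A: tcand_bytes == 0, so tcand_id = A, then tcand_bytes = 1
 *	A: tcand_bytes = 2
 *
 * If a true majority writer exists it necessarily ends up as the
 * candidate; wbc_detach_inode() then weighs it against the current and
 * last-round winners by byte count.
 */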

/**
 * inode_congested - test whether an inode is congested
 * @inode: inode to test for congestion
 * @cong_bits: mask of WB_[a]sync_congested bits to test
 *
 * Tests whether @inode is congested.  @cong_bits is the mask of congestion
 * bits to test and the return value is the mask of set bits.
 *
 * If cgroup writeback is enabled for @inode, the congestion state is
 * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
 * associated with @inode is congested; otherwise, the root wb's congestion
 * state is used.
 */
int inode_congested(struct inode *inode, int cong_bits)
{
	if (inode) {
		struct bdi_writeback *wb = inode_to_wb(inode);
		if (wb)
			return wb_congested(wb, cong_bits);
	}

	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
}
EXPORT_SYMBOL_GPL(inode_congested);

/**
 * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
 * @bdi: bdi the work item was issued to
 * @work: work item to wait for
 *
 * Wait for the completion of @work which was issued to one of @bdi's
 * bdi_writeback's.  The caller must have set @work->single_wait before
 * issuing it.  This wait operates independently of
 * wb_wait_for_completion() and also disables automatic freeing of @work.
 */
static void wb_wait_for_single_work(struct backing_dev_info *bdi,
				    struct wb_writeback_work *work)
{
	if (WARN_ON_ONCE(!work->single_wait))
		return;

	wait_event(bdi->wb_waitq, work->single_done);

	/*
	 * Paired with smp_wmb() in wb_do_writeback() and ensures that all
	 * modifications to @work prior to assertion of ->single_done are
	 * visible to the caller once this function returns.
	 */
	smp_rmb();
}

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
	unsigned long this_bw = wb->avg_write_bandwidth;
	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);

	if (nr_pages == LONG_MAX)
		return LONG_MAX;

	/*
	 * This may be called on clean wb's and proportional distribution
	 * may not make sense, just use the original @nr_pages in those
	 * cases.  In general, we wanna err on the side of writing more.
	 */
	if (!tot_bw || this_bw >= tot_bw)
		return nr_pages;
	else
		return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
}
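
/*
 * For example (illustrative numbers): a wb contributing a quarter of the
 * bdi's total bandwidth gets DIV_ROUND_UP_ULL(1024 * 25, 100) == 256 of a
 * 1024-page request.  A clean or dominant wb (this_bw >= tot_bw) simply
 * gets the full @nr_pages.
 */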

/**
 * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
 * @wb: target bdi_writeback
 * @base_work: source wb_writeback_work
 *
 * Try to make a clone of @base_work and issue it to @wb.  If cloning
 * succeeds, %true is returned; otherwise, @base_work is issued directly
 * and %false is returned.  In the latter case, the caller is required to
 * wait for @base_work's completion using wb_wait_for_single_work().
 *
 * A clone is auto-freed on completion.  @base_work never is.
 */
static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
				    struct wb_writeback_work *base_work)
{
	struct wb_writeback_work *work;

	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work) {
		*work = *base_work;
		work->auto_free = 1;
		work->single_wait = 0;
	} else {
		work = base_work;
		work->auto_free = 0;
		work->single_wait = 1;
	}
	work->single_done = 0;
	wb_queue_work(wb, work);
	return work != base_work;
}
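
/*
 * The fallback above is what keeps bdi_split_work_to_wbs() going under
 * memory pressure: when the GFP_ATOMIC allocation fails, @base_work
 * itself is queued and the iteration pauses in wb_wait_for_single_work()
 * until it completes and can be reused for the next wb.
 */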

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
				  struct wb_writeback_work *base_work,
				  bool skip_if_busy)
{
	long nr_pages = base_work->nr_pages;
	int next_blkcg_id = 0;
	struct bdi_writeback *wb;
	struct wb_iter iter;

	might_sleep();

	if (!bdi_has_dirty_io(bdi))
		return;
restart:
	rcu_read_lock();
	bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
		if (!wb_has_dirty_io(wb) ||
		    (skip_if_busy && writeback_in_progress(wb)))
			continue;

		base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
		if (!wb_clone_and_queue_work(wb, base_work)) {
			next_blkcg_id = wb->blkcg_css->id + 1;
			rcu_read_unlock();
			wb_wait_for_single_work(bdi, base_work);
			goto restart;
		}
	}
	rcu_read_unlock();
}

#else	/* CONFIG_CGROUP_WRITEBACK */

static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
	__acquires(&wb->list_lock)
{
	struct bdi_writeback *wb = inode_to_wb(inode);

	spin_unlock(&inode->i_lock);
	spin_lock(&wb->list_lock);
	return wb;
}

static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	struct bdi_writeback *wb = inode_to_wb(inode);

	spin_lock(&wb->list_lock);
	return wb;
}

static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
	return nr_pages;
}

static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
				  struct wb_writeback_work *base_work,
				  bool skip_if_busy)
{
	might_sleep();

	if (bdi_has_dirty_io(bdi) &&
	    (!skip_if_busy || !writeback_in_progress(&bdi->wb))) {
		base_work->auto_free = 0;
		base_work->single_wait = 0;
		base_work->single_done = 0;
		wb_queue_work(&bdi->wb, base_work);
	}
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
			bool range_cyclic, enum wb_reason reason)
{
	struct wb_writeback_work *work;

	if (!wb_has_dirty_io(wb))
		return;

	/*
	 * This is WB_SYNC_NONE writeback, so if allocation fails just
	 * wakeup the thread for old dirty data writeback
	 */
	work = kzalloc(sizeof(*work), GFP_ATOMIC);
	if (!work) {
		trace_writeback_nowork(wb->bdi);
		wb_wakeup(wb);
		return;
	}

	work->sync_mode	= WB_SYNC_NONE;
	work->nr_pages	= nr_pages;
	work->range_cyclic = range_cyclic;
	work->reason	= reason;
	work->auto_free	= 1;

	wb_queue_work(wb, work);
}

/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for given wb
 *   some IO is happening if we are over background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
	/*
	 * We just wake up the flusher thread.  It will perform background
	 * writeback as soon as there is no other work to do.
	 */
	trace_writeback_wake_background(wb->bdi);
	wb_wakeup(wb);
}

/*
 * Remove the inode from the writeback list it is on.
 */
void inode_wb_list_del(struct inode *inode)
{
	struct bdi_writeback *wb;

	wb = inode_to_wb_and_lock_list(inode);
	inode_wb_list_del_locked(inode, wb);
	spin_unlock(&wb->list_lock);
}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = wb_inode(wb->b_dirty.next);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
	inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
	inode->i_state &= ~I_SYNC;
	/* If inode is clean and unused, put it into LRU now... */
	inode_add_lru(inode);
	/* Waiters must see I_SYNC cleared before being woken up */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in distant past.
	 * This test is necessary to prevent such wrapped-around relative times
	 * from permanently stopping the whole bdi writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}
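
/*
 * To see why the extra check is needed: on 32bit, jiffies wraps roughly
 * every 49.7 days at HZ=1000.  time_after() treats anything more than
 * half the range away as wrapped, so an old enough dirtied_when can
 * _appear_ to be in the future; additionally requiring it to not be
 * after the current jiffies filters those out.
 */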

#define EXPIRE_DIRTY_ATIME 0x0001

/*
 * Move expired (dirtied before work->older_than_this) dirty inodes from
 * @delaying_queue to @dispatch_queue.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
			       struct list_head *dispatch_queue,
			       int flags,
			       struct wb_writeback_work *work)
{
	unsigned long *older_than_this = NULL;
	unsigned long expire_time;
	LIST_HEAD(tmp);
	struct list_head *pos, *node;
	struct super_block *sb = NULL;
	struct inode *inode;
	int do_sb_sort = 0;
	int moved = 0;

	if ((flags & EXPIRE_DIRTY_ATIME) == 0)
		older_than_this = work->older_than_this;
	else if (!work->for_sync) {
		expire_time = jiffies - (dirtytime_expire_interval * HZ);
		older_than_this = &expire_time;
	}
	while (!list_empty(delaying_queue)) {
		inode = wb_inode(delaying_queue->prev);
		if (older_than_this &&
		    inode_dirtied_after(inode, *older_than_this))
			break;
		list_move(&inode->i_wb_list, &tmp);
		moved++;
		if (flags & EXPIRE_DIRTY_ATIME)
			set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
		if (sb_is_blkdev_sb(inode->i_sb))
			continue;
		if (sb && sb != inode->i_sb)
			do_sb_sort = 1;
		sb = inode->i_sb;
	}

	/* just one sb in list, splice to dispatch_queue and we're done */
	if (!do_sb_sort) {
		list_splice(&tmp, dispatch_queue);
		goto out;
	}

	/* Move inodes from one superblock together */
	while (!list_empty(&tmp)) {
		sb = wb_inode(tmp.prev)->i_sb;
		list_for_each_prev_safe(pos, node, &tmp) {
			inode = wb_inode(pos);
			if (inode->i_sb == sb)
				list_move(&inode->i_wb_list, dispatch_queue);
		}
	}
out:
	return moved;
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
	int moved;

	assert_spin_locked(&wb->list_lock);
	list_splice_init(&wb->b_more_io, &wb->b_io);
	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
				     EXPIRE_DIRTY_ATIME, work);
	if (moved)
		wb_io_lists_populated(wb);
	trace_writeback_queue_io(wb, work, moved);
}

static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
	int ret;

	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
		trace_writeback_write_inode_start(inode, wbc);
		ret = inode->i_sb->s_op->write_inode(inode, wbc);
		trace_writeback_write_inode(inode, wbc);
		return ret;
	}
	return 0;
}

/*
 * Wait for writeback on an inode to complete. Called with i_lock held.
 * Caller must make sure inode cannot go away when we drop i_lock.
 */
static void __inode_wait_for_writeback(struct inode *inode)
	__releases(inode->i_lock)
	__acquires(inode->i_lock)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	while (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		__wait_on_bit(wqh, &wq, bit_wait,
			      TASK_UNINTERRUPTIBLE);
		spin_lock(&inode->i_lock);
	}
}

/*
 * Wait for writeback on an inode to complete. Caller must have inode pinned.
 */
void inode_wait_for_writeback(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	__inode_wait_for_writeback(inode);
	spin_unlock(&inode->i_lock);
}

/*
 * Sleep until I_SYNC is cleared. This function must be called with i_lock
 * held and drops it.  It is aimed for callers not holding any inode reference
 * so once i_lock is dropped, inode can go away.
 */
static void inode_sleep_on_writeback(struct inode *inode)
	__releases(inode->i_lock)
{
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	int sleep;

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	sleep = inode->i_state & I_SYNC;
	spin_unlock(&inode->i_lock);
	if (sleep)
		schedule();
	finish_wait(wqh, &wait);
}

/*
 * Find proper writeback list for the inode depending on its current state and
 * possibly also a change of its state while we were doing writeback.  Here we
 * handle things such as livelock prevention or fairness of writeback among
 * inodes. This function can be called only by flusher thread - no one else
 * processes all inodes in writeback lists and requeueing inodes behind flusher
 * thread's back can have unexpected consequences.
 */
static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
			  struct writeback_control *wbc)
{
	if (inode->i_state & I_FREEING)
		return;

	/*
	 * Sync livelock prevention. Each inode is tagged and synced in one
	 * shot. If still dirty, it will be redirty_tail()'ed below.  Update
	 * the dirty time to prevent enqueue and sync it again.
	 */
	if ((inode->i_state & I_DIRTY) &&
	    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
		inode->dirtied_when = jiffies;

	if (wbc->pages_skipped) {
		/*
		 * writeback is not making progress due to locked
		 * buffers.  Skip this inode for now.
		 */
		redirty_tail(inode, wb);
		return;
	}

	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
		/*
		 * We didn't write back all the pages.  nfs_writepages()
		 * sometimes bales out without doing anything.
		 */
		if (wbc->nr_to_write <= 0) {
			/* Slice used up. Queue for next turn. */
			requeue_io(inode, wb);
		} else {
			/*
			 * Writeback blocked by something other than
			 * congestion. Delay the inode for some time to
			 * avoid spinning on the CPU (100% iowait)
			 * retrying writeback of the dirty page/inode
			 * that cannot be performed immediately.
			 */
			redirty_tail(inode, wb);
		}
	} else if (inode->i_state & I_DIRTY) {
		/*
		 * Filesystems can dirty the inode during writeback operations,
		 * such as delayed allocation during submission or metadata
		 * updates after data IO completion.
		 */
		redirty_tail(inode, wb);
	} else if (inode->i_state & I_DIRTY_TIME) {
		inode->dirtied_when = jiffies;
		inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
	} else {
		/* The inode is clean. Remove from writeback lists. */
		inode_wb_list_del_locked(inode, wb);
	}
}
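
/*
 * In summary, requeue_inode() routes the inode as follows:
 *
 *	pages skipped (locked buffers)     -> b_dirty (redirty_tail)
 *	dirty pages left, slice exhausted  -> b_more_io (requeue_io)
 *	dirty pages left, slice remaining  -> b_dirty (redirty_tail)
 *	redirtied metadata (I_DIRTY)       -> b_dirty (redirty_tail)
 *	only I_DIRTY_TIME left             -> b_dirty_time
 *	clean                              -> off the writeback lists
 */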

/*
 * Write out an inode and its dirty pages. Do not update the writeback list
 * linkage. That is left to the caller. The caller is also responsible for
 * setting I_SYNC flag and calling inode_sync_complete() to clear it.
 */
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	long nr_to_write = wbc->nr_to_write;
	unsigned dirty;
	int ret;

	WARN_ON(!(inode->i_state & I_SYNC));

	trace_writeback_single_inode_start(inode, wbc, nr_to_write);

	ret = do_writepages(mapping, wbc);

	/*
	 * Make sure to wait on the data before writing out the metadata.
	 * This is important for filesystems that modify metadata on data
	 * I/O completion. We don't do it for sync(2) writeback because it has a
	 * separate, external IO completion path and ->sync_fs for guaranteeing
	 * inode metadata is written back correctly.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	/*
	 * Some filesystems may redirty the inode during the writeback
	 * due to delalloc, clear dirty metadata flags right before
	 * write_inode()
	 */
	spin_lock(&inode->i_lock);

	dirty = inode->i_state & I_DIRTY;
	if (inode->i_state & I_DIRTY_TIME) {
		if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
		    unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
		    unlikely(time_after(jiffies,
					(inode->dirtied_time_when +
					 dirtytime_expire_interval * HZ)))) {
			dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
			trace_writeback_lazytime(inode);
		}
	} else
		inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
	inode->i_state &= ~dirty;

	/*
	 * Paired with smp_mb() in __mark_inode_dirty().  This allows
	 * __mark_inode_dirty() to test i_state without grabbing i_lock -
	 * either they see the I_DIRTY bits cleared or we see the dirtied
	 * inode.
	 *
	 * I_DIRTY_PAGES is always cleared together above even if @mapping
	 * still has dirty pages.  The flag is reinstated after smp_mb() if
	 * necessary.  This guarantees that either __mark_inode_dirty()
	 * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
	 */
	smp_mb();

	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		inode->i_state |= I_DIRTY_PAGES;

	spin_unlock(&inode->i_lock);

	if (dirty & I_DIRTY_TIME)
		mark_inode_dirty_sync(inode);
	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & ~I_DIRTY_PAGES) {
		int err = write_inode(inode, wbc);
		if (ret == 0)
			ret = err;
	}
	trace_writeback_single_inode(inode, wbc, nr_to_write);
	return ret;
}

/*
 * Write out an inode's dirty pages. Either the caller has an active reference
 * on the inode or the inode has I_WILL_FREE set.
 *
 * This function is designed for writing back one inode at a time, e.g. when
 * called directly from a filesystem.  Flusher thread uses
 * __writeback_single_inode() and does more profound writeback list handling
 * in writeback_sb_inodes().
 */
static int
writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
		       struct writeback_control *wbc)
{
	int ret = 0;

	spin_lock(&inode->i_lock);
	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		if (wbc->sync_mode != WB_SYNC_ALL)
			goto out;
		/*
		 * It's a data-integrity sync. We must wait. Since callers hold
		 * inode reference or inode has I_WILL_FREE set, it cannot go
		 * away under us.
		 */
		__inode_wait_for_writeback(inode);
	}
	WARN_ON(inode->i_state & I_SYNC);
	/*
	 * Skip inode if it is clean and we have no outstanding writeback in
	 * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
	 * function since flusher thread may be doing for example sync in
	 * parallel and if we move the inode, it could get skipped. So here we
	 * make sure inode is on some writeback list and leave it there unless
	 * we have completely cleaned the inode.
	 */
	if (!(inode->i_state & I_DIRTY_ALL) &&
	    (wbc->sync_mode != WB_SYNC_ALL ||
	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
		goto out;
	inode->i_state |= I_SYNC;
	wbc_attach_and_unlock_inode(wbc, inode);

	ret = __writeback_single_inode(inode, wbc);

	wbc_detach_inode(wbc);
	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	/*
	 * If inode is clean, remove it from writeback lists. Otherwise don't
	 * touch it. See comment above for explanation.
	 */
	if (!(inode->i_state & I_DIRTY_ALL))
		inode_wb_list_del_locked(inode, wb);
	spin_unlock(&wb->list_lock);
	inode_sync_complete(inode);
out:
	spin_unlock(&inode->i_lock);
	return ret;
}
11114f8ad655SJan Kara /*
11124f8ad655SJan Kara * Write out an inode's dirty pages. Either the caller has an active reference
11134f8ad655SJan Kara * on the inode or the inode has I_WILL_FREE set.
11144f8ad655SJan Kara *
11154f8ad655SJan Kara * This function is designed for writing back one inode at a time, which is
11164f8ad655SJan Kara * what callers such as filesystems do. The flusher thread instead uses
11174f8ad655SJan Kara * __writeback_single_inode() and does more profound writeback list handling
11184f8ad655SJan Kara * in writeback_sb_inodes().
11184f8ad655SJan Kara */
11194f8ad655SJan Kara static int
11204f8ad655SJan Kara writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
11214f8ad655SJan Kara struct writeback_control *wbc)
11224f8ad655SJan Kara {
11234f8ad655SJan Kara int ret = 0;
11244f8ad655SJan Kara
11254f8ad655SJan Kara spin_lock(&inode->i_lock);
11264f8ad655SJan Kara if (!atomic_read(&inode->i_count))
11274f8ad655SJan Kara WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
11284f8ad655SJan Kara else
11294f8ad655SJan Kara WARN_ON(inode->i_state & I_WILL_FREE);
11304f8ad655SJan Kara
11314f8ad655SJan Kara if (inode->i_state & I_SYNC) {
11324f8ad655SJan Kara if (wbc->sync_mode != WB_SYNC_ALL)
11334f8ad655SJan Kara goto out;
11344f8ad655SJan Kara /*
1135169ebd90SJan Kara * It's a data-integrity sync. We must wait. Since callers hold
1136169ebd90SJan Kara * inode reference or inode has I_WILL_FREE set, it cannot go
1137169ebd90SJan Kara * away under us.
11384f8ad655SJan Kara */
1139169ebd90SJan Kara __inode_wait_for_writeback(inode);
11404f8ad655SJan Kara }
11414f8ad655SJan Kara WARN_ON(inode->i_state & I_SYNC);
11424f8ad655SJan Kara /*
1143f9b0e058SJan Kara * Skip the inode if it is clean and we have no outstanding writeback
1144f9b0e058SJan Kara * in WB_SYNC_ALL mode. We don't want to mess with writeback lists in
1145f9b0e058SJan Kara * this function, since the flusher thread may, for example, be doing
1146f9b0e058SJan Kara * sync(2) in parallel, and if we moved the inode it could get skipped.
1147f9b0e058SJan Kara * So here we make sure the inode is on some writeback list and leave
1148f9b0e058SJan Kara * it there unless we have completely cleaned the inode.
11494f8ad655SJan Kara */
11500ae45f63STheodore Ts'o if (!(inode->i_state & I_DIRTY_ALL) &&
1151f9b0e058SJan Kara (wbc->sync_mode != WB_SYNC_ALL ||
1152f9b0e058SJan Kara !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
11534f8ad655SJan Kara goto out;
11544f8ad655SJan Kara inode->i_state |= I_SYNC;
1155b16b1debSTejun Heo wbc_attach_and_unlock_inode(wbc, inode);
11564f8ad655SJan Kara
1157cd8ed2a4SYan Hong ret = __writeback_single_inode(inode, wbc);
11581da177e4SLinus Torvalds
1159b16b1debSTejun Heo wbc_detach_inode(wbc);
1160f758eeabSChristoph Hellwig spin_lock(&wb->list_lock);
1161250df6edSDave Chinner spin_lock(&inode->i_lock);
11624f8ad655SJan Kara /*
11634f8ad655SJan Kara * If the inode is clean, remove it from writeback lists. Otherwise
11644f8ad655SJan Kara * don't touch it. See comment above for explanation.
11654f8ad655SJan Kara */
11660ae45f63STheodore Ts'o if (!(inode->i_state & I_DIRTY_ALL))
1167d6c10f1fSTejun Heo inode_wb_list_del_locked(inode, wb);
11684f8ad655SJan Kara spin_unlock(&wb->list_lock);
11691c0eeaf5SJoern Engel inode_sync_complete(inode);
11704f8ad655SJan Kara out:
11714f8ad655SJan Kara spin_unlock(&inode->i_lock);
11721da177e4SLinus Torvalds return ret;
11731da177e4SLinus Torvalds }
11741da177e4SLinus Torvalds
1175a88a341aSTejun Heo static long writeback_chunk_size(struct bdi_writeback *wb,
11761a12d8bdSWu Fengguang struct wb_writeback_work *work)
1177d46db3d5SWu Fengguang {
1178d46db3d5SWu Fengguang long pages;
1179d46db3d5SWu Fengguang
1180d46db3d5SWu Fengguang /*
1181d46db3d5SWu Fengguang * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
1182d46db3d5SWu Fengguang * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
1183d46db3d5SWu Fengguang * here avoids calling into writeback_inodes_wb() more than once.
1184d46db3d5SWu Fengguang *
1185d46db3d5SWu Fengguang * The intended call sequence for WB_SYNC_ALL writeback is:
1186d46db3d5SWu Fengguang *
1187d46db3d5SWu Fengguang * wb_writeback()
1188d46db3d5SWu Fengguang * writeback_sb_inodes() <== called only once
1189d46db3d5SWu Fengguang * write_cache_pages() <== called once for each inode
1190d46db3d5SWu Fengguang * (quickly) tag currently dirty pages
1191d46db3d5SWu Fengguang * (maybe slowly) sync all tagged pages
1192d46db3d5SWu Fengguang */
1193d46db3d5SWu Fengguang if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1194d46db3d5SWu Fengguang pages = LONG_MAX;
11951a12d8bdSWu Fengguang else {
1196a88a341aSTejun Heo pages = min(wb->avg_write_bandwidth / 2,
1197dcc25ae7STejun Heo global_wb_domain.dirty_limit / DIRTY_SCOPE);
11981a12d8bdSWu Fengguang pages = min(pages, work->nr_pages);
11991a12d8bdSWu Fengguang pages = round_down(pages + MIN_WRITEBACK_PAGES,
12001a12d8bdSWu Fengguang MIN_WRITEBACK_PAGES);
12011a12d8bdSWu Fengguang }
1202d46db3d5SWu Fengguang
1203d46db3d5SWu Fengguang return pages;
1204d46db3d5SWu Fengguang }
1205d46db3d5SWu Fengguang
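/*
 * Illustrative worked example (not from the original file), with assumed
 * numbers: suppose wb->avg_write_bandwidth is 24576 pages/s (~96MB/s with
 * 4k pages), global_wb_domain.dirty_limit / DIRTY_SCOPE is 16384 pages,
 * and work->nr_pages is larger than both. Then:
 *
 *	pages = min(24576 / 2, 16384)          = 12288
 *	pages = min(12288, work->nr_pages)     = 12288
 *	pages = round_down(12288 + 1024, 1024) = 13312
 *
 * i.e. the chunk is rounded to a multiple of MIN_WRITEBACK_PAGES (1024
 * pages = 4MB on 4k-page systems) and is never below one minimal chunk.
 */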
120603ba3782SJens Axboe /*
1207f11c9c5cSEdward Shishkin * Write a portion of b_io inodes which belong to @sb.
1208edadfb10SChristoph Hellwig *
1209d46db3d5SWu Fengguang * Return the number of pages and/or inodes written.
1210f11c9c5cSEdward Shishkin */
1211d46db3d5SWu Fengguang static long writeback_sb_inodes(struct super_block *sb,
1212d46db3d5SWu Fengguang struct bdi_writeback *wb,
1213d46db3d5SWu Fengguang struct wb_writeback_work *work)
121403ba3782SJens Axboe {
1215d46db3d5SWu Fengguang struct writeback_control wbc = {
1216d46db3d5SWu Fengguang .sync_mode = work->sync_mode,
1217d46db3d5SWu Fengguang .tagged_writepages = work->tagged_writepages,
1218d46db3d5SWu Fengguang .for_kupdate = work->for_kupdate,
1219d46db3d5SWu Fengguang .for_background = work->for_background,
12207747bd4bSDave Chinner .for_sync = work->for_sync,
1221d46db3d5SWu Fengguang .range_cyclic = work->range_cyclic,
1222d46db3d5SWu Fengguang .range_start = 0,
1223d46db3d5SWu Fengguang .range_end = LLONG_MAX,
1224d46db3d5SWu Fengguang };
1225d46db3d5SWu Fengguang unsigned long start_time = jiffies;
1226d46db3d5SWu Fengguang long write_chunk;
1227d46db3d5SWu Fengguang long wrote = 0; /* count both pages and inodes */
1228d46db3d5SWu Fengguang
122903ba3782SJens Axboe while (!list_empty(&wb->b_io)) {
12307ccf19a8SNick Piggin struct inode *inode = wb_inode(wb->b_io.prev);
1231edadfb10SChristoph Hellwig
1232edadfb10SChristoph Hellwig if (inode->i_sb != sb) {
1233d46db3d5SWu Fengguang if (work->sb) {
1234edadfb10SChristoph Hellwig /*
1235edadfb10SChristoph Hellwig * We only want to write back data for this
1236edadfb10SChristoph Hellwig * superblock, move all inodes not belonging
1237edadfb10SChristoph Hellwig * to it back onto the dirty list.
1238edadfb10SChristoph Hellwig */
1239f758eeabSChristoph Hellwig redirty_tail(inode, wb);
124066f3b8e2SJens Axboe continue;
124166f3b8e2SJens Axboe }
1242edadfb10SChristoph Hellwig
1243edadfb10SChristoph Hellwig /*
1244edadfb10SChristoph Hellwig * The inode belongs to a different superblock.
1245edadfb10SChristoph Hellwig * Bounce back to the caller to unpin this and
1246edadfb10SChristoph Hellwig * pin the next superblock.
1247edadfb10SChristoph Hellwig */
1248d46db3d5SWu Fengguang break;
1249edadfb10SChristoph Hellwig }
1250edadfb10SChristoph Hellwig
12519843b76aSChristoph Hellwig /*
1252331cbdeeSWanpeng Li * Don't bother with new inodes or inodes being freed; the first
1253331cbdeeSWanpeng Li * kind does not need periodic writeout yet, and for the latter
12549843b76aSChristoph Hellwig * kind writeout is handled by the freer.
12559843b76aSChristoph Hellwig */
1256250df6edSDave Chinner spin_lock(&inode->i_lock);
12579843b76aSChristoph Hellwig if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1258250df6edSDave Chinner spin_unlock(&inode->i_lock);
1259fcc5c222SWu Fengguang redirty_tail(inode, wb);
12607ef0d737SNick Piggin continue;
12617ef0d737SNick Piggin }
1262cc1676d9SJan Kara if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1263cc1676d9SJan Kara /*
1264cc1676d9SJan Kara * If this inode is locked for writeback and we are not
1265cc1676d9SJan Kara * doing writeback-for-data-integrity, move it to
1266cc1676d9SJan Kara * b_more_io so that writeback can proceed with the
1267cc1676d9SJan Kara * other inodes on s_io.
1268cc1676d9SJan Kara *
1269cc1676d9SJan Kara * We'll have another go at writing back this inode
1270cc1676d9SJan Kara * when we have completed a full scan of b_io.
1271cc1676d9SJan Kara */
1272cc1676d9SJan Kara spin_unlock(&inode->i_lock);
1273cc1676d9SJan Kara requeue_io(inode, wb);
1274cc1676d9SJan Kara trace_writeback_sb_inodes_requeue(inode);
1275cc1676d9SJan Kara continue;
1276cc1676d9SJan Kara }
1277f0d07b7fSJan Kara spin_unlock(&wb->list_lock);
1278f0d07b7fSJan Kara
12794f8ad655SJan Kara /*
12804f8ad655SJan Kara * We already requeued the inode if it had I_SYNC set and we
12814f8ad655SJan Kara * are doing WB_SYNC_NONE writeback. So this catches only the
12824f8ad655SJan Kara * WB_SYNC_ALL case.
12834f8ad655SJan Kara */
1284169ebd90SJan Kara if (inode->i_state & I_SYNC) {
1285169ebd90SJan Kara /* Wait for I_SYNC. This function drops i_lock... */
1286169ebd90SJan Kara inode_sleep_on_writeback(inode);
1287169ebd90SJan Kara /* Inode may be gone, start again */
1288ead188f9SJan Kara spin_lock(&wb->list_lock);
1289169ebd90SJan Kara continue;
1290169ebd90SJan Kara }
12914f8ad655SJan Kara inode->i_state |= I_SYNC;
1292b16b1debSTejun Heo wbc_attach_and_unlock_inode(&wbc, inode);
1293169ebd90SJan Kara
1294a88a341aSTejun Heo write_chunk = writeback_chunk_size(wb, work);
1295d46db3d5SWu Fengguang wbc.nr_to_write = write_chunk;
1296d46db3d5SWu Fengguang wbc.pages_skipped = 0;
1297250df6edSDave Chinner
1298169ebd90SJan Kara /*
1299169ebd90SJan Kara * We use I_SYNC to pin the inode in memory. While it is set,
1300169ebd90SJan Kara * evict_inode() will wait, so the inode cannot be freed.
1301169ebd90SJan Kara */ 1302cd8ed2a4SYan Hong __writeback_single_inode(inode, &wbc); 1303d46db3d5SWu Fengguang 1304b16b1debSTejun Heo wbc_detach_inode(&wbc); 1305d46db3d5SWu Fengguang work->nr_pages -= write_chunk - wbc.nr_to_write; 1306d46db3d5SWu Fengguang wrote += write_chunk - wbc.nr_to_write; 13074f8ad655SJan Kara spin_lock(&wb->list_lock); 13084f8ad655SJan Kara spin_lock(&inode->i_lock); 13090ae45f63STheodore Ts'o if (!(inode->i_state & I_DIRTY_ALL)) 1310d46db3d5SWu Fengguang wrote++; 13114f8ad655SJan Kara requeue_inode(inode, wb, &wbc); 13124f8ad655SJan Kara inode_sync_complete(inode); 13130f1b1fd8SDave Chinner spin_unlock(&inode->i_lock); 1314169ebd90SJan Kara cond_resched_lock(&wb->list_lock); 1315d46db3d5SWu Fengguang /* 1316d46db3d5SWu Fengguang * bail out to wb_writeback() often enough to check 1317d46db3d5SWu Fengguang * background threshold and other termination conditions. 1318d46db3d5SWu Fengguang */ 1319d46db3d5SWu Fengguang if (wrote) { 1320d46db3d5SWu Fengguang if (time_is_before_jiffies(start_time + HZ / 10UL)) 1321d46db3d5SWu Fengguang break; 1322d46db3d5SWu Fengguang if (work->nr_pages <= 0) 1323d46db3d5SWu Fengguang break; 13241da177e4SLinus Torvalds } 13258bc3be27SFengguang Wu } 1326d46db3d5SWu Fengguang return wrote; 1327f11c9c5cSEdward Shishkin } 132838f21977SNick Piggin 1329d46db3d5SWu Fengguang static long __writeback_inodes_wb(struct bdi_writeback *wb, 1330d46db3d5SWu Fengguang struct wb_writeback_work *work) 1331f11c9c5cSEdward Shishkin { 1332d46db3d5SWu Fengguang unsigned long start_time = jiffies; 1333d46db3d5SWu Fengguang long wrote = 0; 1334f11c9c5cSEdward Shishkin 1335f11c9c5cSEdward Shishkin while (!list_empty(&wb->b_io)) { 13367ccf19a8SNick Piggin struct inode *inode = wb_inode(wb->b_io.prev); 1337f11c9c5cSEdward Shishkin struct super_block *sb = inode->i_sb; 1338f11c9c5cSEdward Shishkin 1339eb6ef3dfSKonstantin Khlebnikov if (!trylock_super(sb)) { 13400e995816SWu Fengguang /* 1341eb6ef3dfSKonstantin Khlebnikov * trylock_super() may fail consistently due to 13420e995816SWu Fengguang * s_umount being grabbed by someone else. Don't use 13430e995816SWu Fengguang * requeue_io() to avoid busy retrying the inode/sb. 
13440e995816SWu Fengguang */ 13450e995816SWu Fengguang redirty_tail(inode, wb); 1346d19de7edSChristoph Hellwig continue; 1347334132aeSChristoph Hellwig } 1348d46db3d5SWu Fengguang wrote += writeback_sb_inodes(sb, wb, work); 1349eb6ef3dfSKonstantin Khlebnikov up_read(&sb->s_umount); 1350f11c9c5cSEdward Shishkin 1351d46db3d5SWu Fengguang /* refer to the same tests at the end of writeback_sb_inodes */ 1352d46db3d5SWu Fengguang if (wrote) { 1353d46db3d5SWu Fengguang if (time_is_before_jiffies(start_time + HZ / 10UL)) 1354d46db3d5SWu Fengguang break; 1355d46db3d5SWu Fengguang if (work->nr_pages <= 0) 1356f11c9c5cSEdward Shishkin break; 1357f11c9c5cSEdward Shishkin } 1358d46db3d5SWu Fengguang } 135966f3b8e2SJens Axboe /* Leave any unwritten inodes on b_io */ 1360d46db3d5SWu Fengguang return wrote; 136166f3b8e2SJens Axboe } 136266f3b8e2SJens Axboe 13637d9f073bSWanpeng Li static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, 13640e175a18SCurt Wohlgemuth enum wb_reason reason) 1365edadfb10SChristoph Hellwig { 1366d46db3d5SWu Fengguang struct wb_writeback_work work = { 1367d46db3d5SWu Fengguang .nr_pages = nr_pages, 1368d46db3d5SWu Fengguang .sync_mode = WB_SYNC_NONE, 1369d46db3d5SWu Fengguang .range_cyclic = 1, 13700e175a18SCurt Wohlgemuth .reason = reason, 1371d46db3d5SWu Fengguang }; 1372edadfb10SChristoph Hellwig 1373f758eeabSChristoph Hellwig spin_lock(&wb->list_lock); 1374424b351fSWu Fengguang if (list_empty(&wb->b_io)) 1375ad4e38ddSCurt Wohlgemuth queue_io(wb, &work); 1376d46db3d5SWu Fengguang __writeback_inodes_wb(wb, &work); 1377f758eeabSChristoph Hellwig spin_unlock(&wb->list_lock); 1378edadfb10SChristoph Hellwig 1379d46db3d5SWu Fengguang return nr_pages - work.nr_pages; 138066f3b8e2SJens Axboe } 138166f3b8e2SJens Axboe 138203ba3782SJens Axboe /* 138303ba3782SJens Axboe * Explicit flushing or periodic writeback of "old" data. 138403ba3782SJens Axboe * 138503ba3782SJens Axboe * Define "old": the first time one of an inode's pages is dirtied, we mark the 138603ba3782SJens Axboe * dirtying-time in the inode's address_space. So this periodic writeback code 138703ba3782SJens Axboe * just walks the superblock inode list, writing back any inodes which are 138803ba3782SJens Axboe * older than a specific point in time. 138903ba3782SJens Axboe * 139003ba3782SJens Axboe * Try to run once per dirty_writeback_interval. But if a writeback event 139103ba3782SJens Axboe * takes longer than a dirty_writeback_interval interval, then leave a 139203ba3782SJens Axboe * one-second gap. 139303ba3782SJens Axboe * 139403ba3782SJens Axboe * older_than_this takes precedence over nr_to_write. So we'll only write back 139503ba3782SJens Axboe * all dirty pages if they are all attached to "old" mappings. 
139603ba3782SJens Axboe */
1397c4a77a6cSJens Axboe static long wb_writeback(struct bdi_writeback *wb,
139883ba7b07SChristoph Hellwig struct wb_writeback_work *work)
139903ba3782SJens Axboe {
1400e98be2d5SWu Fengguang unsigned long wb_start = jiffies;
1401d46db3d5SWu Fengguang long nr_pages = work->nr_pages;
14020dc83bd3SJan Kara unsigned long oldest_jif;
1403a5989bdcSJan Kara struct inode *inode;
1404d46db3d5SWu Fengguang long progress;
140503ba3782SJens Axboe
14060dc83bd3SJan Kara oldest_jif = jiffies;
14070dc83bd3SJan Kara work->older_than_this = &oldest_jif;
140803ba3782SJens Axboe
1409e8dfc305SWu Fengguang spin_lock(&wb->list_lock);
141003ba3782SJens Axboe for (;;) {
141103ba3782SJens Axboe /*
1412d3ddec76SWu Fengguang * Stop writeback when nr_pages has been consumed
141303ba3782SJens Axboe */
141483ba7b07SChristoph Hellwig if (work->nr_pages <= 0)
141503ba3782SJens Axboe break;
141603ba3782SJens Axboe
141703ba3782SJens Axboe /*
1418aa373cf5SJan Kara * Background writeout and kupdate-style writeback may
1419aa373cf5SJan Kara * run forever. Stop them if there is other work to do
1420aa373cf5SJan Kara * so that e.g. sync can proceed. They'll be restarted
1421aa373cf5SJan Kara * after the other work items are all done.
1422aa373cf5SJan Kara */
1423aa373cf5SJan Kara if ((work->for_background || work->for_kupdate) &&
1424f0054bb1STejun Heo !list_empty(&wb->work_list))
1425aa373cf5SJan Kara break;
1426aa373cf5SJan Kara
1427aa373cf5SJan Kara /*
1428d3ddec76SWu Fengguang * For background writeout, stop when we are below the
1429d3ddec76SWu Fengguang * background dirty threshold
143003ba3782SJens Axboe */
1431aa661bbeSTejun Heo if (work->for_background && !wb_over_bg_thresh(wb))
143203ba3782SJens Axboe break;
143303ba3782SJens Axboe
14341bc36b64SJan Kara /*
14351bc36b64SJan Kara * Kupdate and background work items are special and we want to
14361bc36b64SJan Kara * include all inodes that need writing. Livelock avoidance is
14371bc36b64SJan Kara * handled by these work items yielding to any other work so we
14381bc36b64SJan Kara * are safe.
14391bc36b64SJan Kara */
1440ba9aa839SWu Fengguang if (work->for_kupdate) {
14410dc83bd3SJan Kara oldest_jif = jiffies -
1442ba9aa839SWu Fengguang msecs_to_jiffies(dirty_expire_interval * 10);
14431bc36b64SJan Kara } else if (work->for_background)
14440dc83bd3SJan Kara oldest_jif = jiffies;
1445028c2dd1SDave Chinner
1446d46db3d5SWu Fengguang trace_writeback_start(wb->bdi, work);
1447e8dfc305SWu Fengguang if (list_empty(&wb->b_io))
1448ad4e38ddSCurt Wohlgemuth queue_io(wb, work);
144983ba7b07SChristoph Hellwig if (work->sb)
1450d46db3d5SWu Fengguang progress = writeback_sb_inodes(work->sb, wb, work);
1451edadfb10SChristoph Hellwig else
1452d46db3d5SWu Fengguang progress = __writeback_inodes_wb(wb, work);
1453d46db3d5SWu Fengguang trace_writeback_written(wb->bdi, work);
1454028c2dd1SDave Chinner
1455e98be2d5SWu Fengguang wb_update_bandwidth(wb, wb_start);
145603ba3782SJens Axboe
145703ba3782SJens Axboe /*
145871fd05a8SJens Axboe * Did we write something? Try for more
1459e6fb6da2SWu Fengguang *
1460e6fb6da2SWu Fengguang * Dirty inodes are moved to b_io for writeback in batches.
1461e6fb6da2SWu Fengguang * The completion of the current batch does not necessarily
1462e6fb6da2SWu Fengguang * mean the overall work is done. So we keep looping as long
1463e6fb6da2SWu Fengguang * as we have made some progress on cleaning pages or inodes.
146471fd05a8SJens Axboe */ 1465d46db3d5SWu Fengguang if (progress) 146603ba3782SJens Axboe continue; 1467a5989bdcSJan Kara /* 1468e6fb6da2SWu Fengguang * No more inodes for IO, bail 1469a5989bdcSJan Kara */ 1470b7a2441fSWu Fengguang if (list_empty(&wb->b_more_io)) 147103ba3782SJens Axboe break; 147203ba3782SJens Axboe /* 14738010c3b6SJens Axboe * Nothing written. Wait for some inode to 14748010c3b6SJens Axboe * become available for writeback. Otherwise 14758010c3b6SJens Axboe * we'll just busyloop. 147603ba3782SJens Axboe */ 147703ba3782SJens Axboe if (!list_empty(&wb->b_more_io)) { 1478d46db3d5SWu Fengguang trace_writeback_wait(wb->bdi, work); 147903ba3782SJens Axboe inode = wb_inode(wb->b_more_io.prev); 1480250df6edSDave Chinner spin_lock(&inode->i_lock); 1481f0d07b7fSJan Kara spin_unlock(&wb->list_lock); 1482169ebd90SJan Kara /* This function drops i_lock... */ 1483169ebd90SJan Kara inode_sleep_on_writeback(inode); 1484f0d07b7fSJan Kara spin_lock(&wb->list_lock); 148503ba3782SJens Axboe } 148603ba3782SJens Axboe } 1487e8dfc305SWu Fengguang spin_unlock(&wb->list_lock); 148803ba3782SJens Axboe 1489d46db3d5SWu Fengguang return nr_pages - work->nr_pages; 149003ba3782SJens Axboe } 149103ba3782SJens Axboe 149203ba3782SJens Axboe /* 149383ba7b07SChristoph Hellwig * Return the next wb_writeback_work struct that hasn't been processed yet. 149403ba3782SJens Axboe */ 1495f0054bb1STejun Heo static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) 149603ba3782SJens Axboe { 149783ba7b07SChristoph Hellwig struct wb_writeback_work *work = NULL; 149803ba3782SJens Axboe 1499f0054bb1STejun Heo spin_lock_bh(&wb->work_lock); 1500f0054bb1STejun Heo if (!list_empty(&wb->work_list)) { 1501f0054bb1STejun Heo work = list_entry(wb->work_list.next, 150283ba7b07SChristoph Hellwig struct wb_writeback_work, list); 150383ba7b07SChristoph Hellwig list_del_init(&work->list); 150403ba3782SJens Axboe } 1505f0054bb1STejun Heo spin_unlock_bh(&wb->work_lock); 150683ba7b07SChristoph Hellwig return work; 150703ba3782SJens Axboe } 150803ba3782SJens Axboe 1509cdf01dd5SLinus Torvalds /* 1510cdf01dd5SLinus Torvalds * Add in the number of potentially dirty inodes, because each inode 1511cdf01dd5SLinus Torvalds * write can dirty pagecache in the underlying blockdev. 
1512cdf01dd5SLinus Torvalds */ 1513cdf01dd5SLinus Torvalds static unsigned long get_nr_dirty_pages(void) 1514cdf01dd5SLinus Torvalds { 1515cdf01dd5SLinus Torvalds return global_page_state(NR_FILE_DIRTY) + 1516cdf01dd5SLinus Torvalds global_page_state(NR_UNSTABLE_NFS) + 1517cdf01dd5SLinus Torvalds get_nr_dirty_inodes(); 1518cdf01dd5SLinus Torvalds } 1519cdf01dd5SLinus Torvalds 15206585027aSJan Kara static long wb_check_background_flush(struct bdi_writeback *wb) 15216585027aSJan Kara { 1522aa661bbeSTejun Heo if (wb_over_bg_thresh(wb)) { 15236585027aSJan Kara 15246585027aSJan Kara struct wb_writeback_work work = { 15256585027aSJan Kara .nr_pages = LONG_MAX, 15266585027aSJan Kara .sync_mode = WB_SYNC_NONE, 15276585027aSJan Kara .for_background = 1, 15286585027aSJan Kara .range_cyclic = 1, 15290e175a18SCurt Wohlgemuth .reason = WB_REASON_BACKGROUND, 15306585027aSJan Kara }; 15316585027aSJan Kara 15326585027aSJan Kara return wb_writeback(wb, &work); 15336585027aSJan Kara } 15346585027aSJan Kara 15356585027aSJan Kara return 0; 15366585027aSJan Kara } 15376585027aSJan Kara 153803ba3782SJens Axboe static long wb_check_old_data_flush(struct bdi_writeback *wb) 153903ba3782SJens Axboe { 154003ba3782SJens Axboe unsigned long expired; 154103ba3782SJens Axboe long nr_pages; 154203ba3782SJens Axboe 154369b62d01SJens Axboe /* 154469b62d01SJens Axboe * When set to zero, disable periodic writeback 154569b62d01SJens Axboe */ 154669b62d01SJens Axboe if (!dirty_writeback_interval) 154769b62d01SJens Axboe return 0; 154869b62d01SJens Axboe 154903ba3782SJens Axboe expired = wb->last_old_flush + 155003ba3782SJens Axboe msecs_to_jiffies(dirty_writeback_interval * 10); 155103ba3782SJens Axboe if (time_before(jiffies, expired)) 155203ba3782SJens Axboe return 0; 155303ba3782SJens Axboe 155403ba3782SJens Axboe wb->last_old_flush = jiffies; 1555cdf01dd5SLinus Torvalds nr_pages = get_nr_dirty_pages(); 155603ba3782SJens Axboe 1557c4a77a6cSJens Axboe if (nr_pages) { 155883ba7b07SChristoph Hellwig struct wb_writeback_work work = { 1559c4a77a6cSJens Axboe .nr_pages = nr_pages, 1560c4a77a6cSJens Axboe .sync_mode = WB_SYNC_NONE, 1561c4a77a6cSJens Axboe .for_kupdate = 1, 1562c4a77a6cSJens Axboe .range_cyclic = 1, 15630e175a18SCurt Wohlgemuth .reason = WB_REASON_PERIODIC, 1564c4a77a6cSJens Axboe }; 1565c4a77a6cSJens Axboe 156683ba7b07SChristoph Hellwig return wb_writeback(wb, &work); 1567c4a77a6cSJens Axboe } 156803ba3782SJens Axboe 156903ba3782SJens Axboe return 0; 157003ba3782SJens Axboe } 157103ba3782SJens Axboe 157203ba3782SJens Axboe /* 157303ba3782SJens Axboe * Retrieve work items and do the writeback they describe 157403ba3782SJens Axboe */ 157525d130baSWanpeng Li static long wb_do_writeback(struct bdi_writeback *wb) 157603ba3782SJens Axboe { 157783ba7b07SChristoph Hellwig struct wb_writeback_work *work; 1578c4a77a6cSJens Axboe long wrote = 0; 157903ba3782SJens Axboe 15804452226eSTejun Heo set_bit(WB_writeback_running, &wb->state); 1581f0054bb1STejun Heo while ((work = get_next_work_item(wb)) != NULL) { 1582cc395d7fSTejun Heo struct wb_completion *done = work->done; 158398754bf7STejun Heo bool need_wake_up = false; 158483ba7b07SChristoph Hellwig 1585f0054bb1STejun Heo trace_writeback_exec(wb->bdi, work); 1586455b2864SDave Chinner 158783ba7b07SChristoph Hellwig wrote += wb_writeback(wb, work); 158803ba3782SJens Axboe 158998754bf7STejun Heo if (work->single_wait) { 159098754bf7STejun Heo WARN_ON_ONCE(work->auto_free); 159198754bf7STejun Heo /* paired w/ rmb in wb_wait_for_single_work() */ 159298754bf7STejun Heo smp_wmb(); 
159398754bf7STejun Heo work->single_done = 1; 159498754bf7STejun Heo need_wake_up = true; 159598754bf7STejun Heo } else if (work->auto_free) { 159683ba7b07SChristoph Hellwig kfree(work); 159798754bf7STejun Heo } 159898754bf7STejun Heo 1599cc395d7fSTejun Heo if (done && atomic_dec_and_test(&done->cnt)) 160098754bf7STejun Heo need_wake_up = true; 160198754bf7STejun Heo 160298754bf7STejun Heo if (need_wake_up) 1603cc395d7fSTejun Heo wake_up_all(&wb->bdi->wb_waitq); 160403ba3782SJens Axboe } 160503ba3782SJens Axboe 160603ba3782SJens Axboe /* 160703ba3782SJens Axboe * Check for periodic writeback, kupdated() style 160803ba3782SJens Axboe */ 160903ba3782SJens Axboe wrote += wb_check_old_data_flush(wb); 16106585027aSJan Kara wrote += wb_check_background_flush(wb); 16114452226eSTejun Heo clear_bit(WB_writeback_running, &wb->state); 161203ba3782SJens Axboe 161303ba3782SJens Axboe return wrote; 161403ba3782SJens Axboe } 161503ba3782SJens Axboe 161603ba3782SJens Axboe /* 161703ba3782SJens Axboe * Handle writeback of dirty data for the device backed by this bdi. Also 1618839a8e86STejun Heo * reschedules periodically and does kupdated style flushing. 161903ba3782SJens Axboe */ 1620f0054bb1STejun Heo void wb_workfn(struct work_struct *work) 162103ba3782SJens Axboe { 1622839a8e86STejun Heo struct bdi_writeback *wb = container_of(to_delayed_work(work), 1623839a8e86STejun Heo struct bdi_writeback, dwork); 162403ba3782SJens Axboe long pages_written; 162503ba3782SJens Axboe 1626f0054bb1STejun Heo set_worker_desc("flush-%s", dev_name(wb->bdi->dev)); 1627766f9164SPeter Zijlstra current->flags |= PF_SWAPWRITE; 162803ba3782SJens Axboe 1629839a8e86STejun Heo if (likely(!current_is_workqueue_rescuer() || 16304452226eSTejun Heo !test_bit(WB_registered, &wb->state))) { 163103ba3782SJens Axboe /* 1632f0054bb1STejun Heo * The normal path. Keep writing back @wb until its 1633839a8e86STejun Heo * work_list is empty. Note that this path is also taken 1634f0054bb1STejun Heo * if @wb is shutting down even when we're running off the 1635839a8e86STejun Heo * rescuer as work_list needs to be drained. 163603ba3782SJens Axboe */ 1637839a8e86STejun Heo do { 163825d130baSWanpeng Li pages_written = wb_do_writeback(wb); 1639455b2864SDave Chinner trace_writeback_pages_written(pages_written); 1640f0054bb1STejun Heo } while (!list_empty(&wb->work_list)); 1641839a8e86STejun Heo } else { 1642253c34e9SArtem Bityutskiy /* 1643839a8e86STejun Heo * bdi_wq can't get enough workers and we're running off 1644839a8e86STejun Heo * the emergency worker. Don't hog it. Hopefully, 1024 is 1645839a8e86STejun Heo * enough for efficient IO. 1646253c34e9SArtem Bityutskiy */ 1647f0054bb1STejun Heo pages_written = writeback_inodes_wb(wb, 1024, 1648839a8e86STejun Heo WB_REASON_FORKER_THREAD); 1649839a8e86STejun Heo trace_writeback_pages_written(pages_written); 165003ba3782SJens Axboe } 165103ba3782SJens Axboe 1652f0054bb1STejun Heo if (!list_empty(&wb->work_list)) 16536ca738d6SDerek Basehore mod_delayed_work(bdi_wq, &wb->dwork, 0); 16546ca738d6SDerek Basehore else if (wb_has_dirty_io(wb) && dirty_writeback_interval) 1655f0054bb1STejun Heo wb_wakeup_delayed(wb); 1656455b2864SDave Chinner 1657839a8e86STejun Heo current->flags &= ~PF_SWAPWRITE; 165803ba3782SJens Axboe } 165903ba3782SJens Axboe 166003ba3782SJens Axboe /* 166103ba3782SJens Axboe * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 166203ba3782SJens Axboe * the whole world. 
166303ba3782SJens Axboe */
16640e175a18SCurt Wohlgemuth void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
166503ba3782SJens Axboe {
1666b8c2f347SChristoph Hellwig struct backing_dev_info *bdi;
1667b8c2f347SChristoph Hellwig
166847df3ddeSJan Kara if (!nr_pages)
166947df3ddeSJan Kara nr_pages = get_nr_dirty_pages();
1670b8c2f347SChristoph Hellwig
1671b8c2f347SChristoph Hellwig rcu_read_lock();
1672f2b65121STejun Heo list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
1673f2b65121STejun Heo struct bdi_writeback *wb;
1674f2b65121STejun Heo struct wb_iter iter;
1675f2b65121STejun Heo
1676f2b65121STejun Heo if (!bdi_has_dirty_io(bdi))
1677f2b65121STejun Heo continue;
1678f2b65121STejun Heo
1679f2b65121STejun Heo bdi_for_each_wb(wb, bdi, &iter, 0)
1680f2b65121STejun Heo wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
1681f2b65121STejun Heo false, reason);
1682f2b65121STejun Heo }
1683b8c2f347SChristoph Hellwig rcu_read_unlock();
168403ba3782SJens Axboe }
168503ba3782SJens Axboe
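/*
 * Illustrative sketch (not from the original file): sync(2)-style callers
 * are the typical users of this entry point; passing 0 means "write back
 * everything currently dirty" via the !nr_pages branch above. Hypothetical:
 */
#if 0	/* example only, not compiled */
static void example_flush_everything(void)
{
	/* 0 expands to get_nr_dirty_pages(); see the top of the function */
	wakeup_flusher_threads(0, WB_REASON_SYNC);
}
#endif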
1686a2f48706STheodore Ts'o /*
1687a2f48706STheodore Ts'o * Wake up bdis periodically to make sure dirtytime inodes get
1688a2f48706STheodore Ts'o * written back periodically. We deliberately do *not* check the
1689a2f48706STheodore Ts'o * b_dirtytime list in wb_has_dirty_io(), since this would cause the
1690a2f48706STheodore Ts'o * kernel to be constantly waking up once there are any dirtytime
1691a2f48706STheodore Ts'o * inodes on the system. So instead we define a separate delayed work
1692a2f48706STheodore Ts'o * function which gets called much more rarely. (By default, only
1693a2f48706STheodore Ts'o * once every 12 hours.)
1694a2f48706STheodore Ts'o *
1695a2f48706STheodore Ts'o * If there is any other write activity going on in the file system,
1696a2f48706STheodore Ts'o * this function won't be necessary. But if the only thing that has
1697a2f48706STheodore Ts'o * happened on the file system is a dirtytime inode caused by an atime
1698a2f48706STheodore Ts'o * update, we need this infrastructure below to make sure that the inode
1699a2f48706STheodore Ts'o * eventually gets pushed out to disk.
1700a2f48706STheodore Ts'o */
1701a2f48706STheodore Ts'o static void wakeup_dirtytime_writeback(struct work_struct *w);
1702a2f48706STheodore Ts'o static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
1703a2f48706STheodore Ts'o
1704a2f48706STheodore Ts'o static void wakeup_dirtytime_writeback(struct work_struct *w)
1705a2f48706STheodore Ts'o {
1706a2f48706STheodore Ts'o struct backing_dev_info *bdi;
1707a2f48706STheodore Ts'o
1708a2f48706STheodore Ts'o rcu_read_lock();
1709a2f48706STheodore Ts'o list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
1710001fe6f6STejun Heo struct bdi_writeback *wb;
1711001fe6f6STejun Heo struct wb_iter iter;
1712001fe6f6STejun Heo
1713001fe6f6STejun Heo bdi_for_each_wb(wb, bdi, &iter, 0)
1714001fe6f6STejun Heo if (!list_empty(&wb->b_dirty_time))
1715f0054bb1STejun Heo wb_wakeup(wb); /* wake the member wb, not just the root wb */
1716a2f48706STheodore Ts'o }
1717a2f48706STheodore Ts'o rcu_read_unlock();
1718a2f48706STheodore Ts'o schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
1719a2f48706STheodore Ts'o }
1720a2f48706STheodore Ts'o
1721a2f48706STheodore Ts'o static int __init start_dirtytime_writeback(void)
1722a2f48706STheodore Ts'o {
1723a2f48706STheodore Ts'o schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
1724a2f48706STheodore Ts'o return 0;
1725a2f48706STheodore Ts'o }
1726a2f48706STheodore Ts'o __initcall(start_dirtytime_writeback);
1727a2f48706STheodore Ts'o
17281efff914STheodore Ts'o int dirtytime_interval_handler(struct ctl_table *table, int write,
17291efff914STheodore Ts'o void __user *buffer, size_t *lenp, loff_t *ppos)
17301efff914STheodore Ts'o {
17311efff914STheodore Ts'o int ret;
17321efff914STheodore Ts'o
17331efff914STheodore Ts'o ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
17341efff914STheodore Ts'o if (ret == 0 && write)
17351efff914STheodore Ts'o mod_delayed_work(system_wq, &dirtytime_work, 0);
17361efff914STheodore Ts'o return ret;
17371efff914STheodore Ts'o }
17381efff914STheodore Ts'o
173903ba3782SJens Axboe static noinline void block_dump___mark_inode_dirty(struct inode *inode)
174003ba3782SJens Axboe {
174103ba3782SJens Axboe if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
174203ba3782SJens Axboe struct dentry *dentry;
174303ba3782SJens Axboe const char *name = "?";
174403ba3782SJens Axboe
174503ba3782SJens Axboe dentry = d_find_alias(inode);
174603ba3782SJens Axboe if (dentry) {
174703ba3782SJens Axboe spin_lock(&dentry->d_lock);
174803ba3782SJens Axboe name = (const char *) dentry->d_name.name;
174903ba3782SJens Axboe }
175003ba3782SJens Axboe printk(KERN_DEBUG
175103ba3782SJens Axboe "%s(%d): dirtied inode %lu (%s) on %s\n",
175203ba3782SJens Axboe current->comm, task_pid_nr(current), inode->i_ino,
175303ba3782SJens Axboe name, inode->i_sb->s_id);
175403ba3782SJens Axboe if (dentry) {
175503ba3782SJens Axboe spin_unlock(&dentry->d_lock);
175603ba3782SJens Axboe dput(dentry);
175703ba3782SJens Axboe }
175803ba3782SJens Axboe }
175903ba3782SJens Axboe }
176003ba3782SJens Axboe
176103ba3782SJens Axboe /**
176203ba3782SJens Axboe * __mark_inode_dirty - internal function
176303ba3782SJens Axboe * @inode: inode to mark
176403ba3782SJens Axboe * @flags: what kind of dirty (e.g. I_DIRTY_SYNC)
176503ba3782SJens Axboe * Mark an inode as dirty. Callers should use mark_inode_dirty or
176603ba3782SJens Axboe * mark_inode_dirty_sync.
176703ba3782SJens Axboe *
176803ba3782SJens Axboe * Put the inode on the super block's dirty list.
176903ba3782SJens Axboe *
177003ba3782SJens Axboe * CAREFUL!
We mark it dirty unconditionally, but move it onto the 177103ba3782SJens Axboe * dirty list only if it is hashed or if it refers to a blockdev. 177203ba3782SJens Axboe * If it was not hashed, it will never be added to the dirty list 177303ba3782SJens Axboe * even if it is later hashed, as it will have been marked dirty already. 177403ba3782SJens Axboe * 177503ba3782SJens Axboe * In short, make sure you hash any inodes _before_ you start marking 177603ba3782SJens Axboe * them dirty. 177703ba3782SJens Axboe * 177803ba3782SJens Axboe * Note that for blockdevs, inode->dirtied_when represents the dirtying time of 177903ba3782SJens Axboe * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of 178003ba3782SJens Axboe * the kernel-internal blockdev inode represents the dirtying time of the 178103ba3782SJens Axboe * blockdev's pages. This is why for I_DIRTY_PAGES we always use 178203ba3782SJens Axboe * page->mapping->host, so the page-dirtying time is recorded in the internal 178303ba3782SJens Axboe * blockdev inode. 178403ba3782SJens Axboe */ 17850ae45f63STheodore Ts'o #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) 178603ba3782SJens Axboe void __mark_inode_dirty(struct inode *inode, int flags) 178703ba3782SJens Axboe { 178803ba3782SJens Axboe struct super_block *sb = inode->i_sb; 17890ae45f63STheodore Ts'o int dirtytime; 17900ae45f63STheodore Ts'o 17910ae45f63STheodore Ts'o trace_writeback_mark_inode_dirty(inode, flags); 179203ba3782SJens Axboe 179303ba3782SJens Axboe /* 179403ba3782SJens Axboe * Don't do this for I_DIRTY_PAGES - that doesn't actually 179503ba3782SJens Axboe * dirty the inode itself 179603ba3782SJens Axboe */ 17970ae45f63STheodore Ts'o if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) { 17989fb0a7daSTejun Heo trace_writeback_dirty_inode_start(inode, flags); 17999fb0a7daSTejun Heo 180003ba3782SJens Axboe if (sb->s_op->dirty_inode) 1801aa385729SChristoph Hellwig sb->s_op->dirty_inode(inode, flags); 18029fb0a7daSTejun Heo 18039fb0a7daSTejun Heo trace_writeback_dirty_inode(inode, flags); 180403ba3782SJens Axboe } 18050ae45f63STheodore Ts'o if (flags & I_DIRTY_INODE) 18060ae45f63STheodore Ts'o flags &= ~I_DIRTY_TIME; 18070ae45f63STheodore Ts'o dirtytime = flags & I_DIRTY_TIME; 180803ba3782SJens Axboe 180903ba3782SJens Axboe /* 18109c6ac78eSTejun Heo * Paired with smp_mb() in __writeback_single_inode() for the 18119c6ac78eSTejun Heo * following lockless i_state test. See there for details. 
181203ba3782SJens Axboe */ 181303ba3782SJens Axboe smp_mb(); 181403ba3782SJens Axboe 18150ae45f63STheodore Ts'o if (((inode->i_state & flags) == flags) || 18160ae45f63STheodore Ts'o (dirtytime && (inode->i_state & I_DIRTY_INODE))) 181703ba3782SJens Axboe return; 181803ba3782SJens Axboe 181903ba3782SJens Axboe if (unlikely(block_dump)) 182003ba3782SJens Axboe block_dump___mark_inode_dirty(inode); 182103ba3782SJens Axboe 1822250df6edSDave Chinner spin_lock(&inode->i_lock); 18230ae45f63STheodore Ts'o if (dirtytime && (inode->i_state & I_DIRTY_INODE)) 18240ae45f63STheodore Ts'o goto out_unlock_inode; 182503ba3782SJens Axboe if ((inode->i_state & flags) != flags) { 182603ba3782SJens Axboe const int was_dirty = inode->i_state & I_DIRTY; 182703ba3782SJens Axboe 182852ebea74STejun Heo inode_attach_wb(inode, NULL); 182952ebea74STejun Heo 18300ae45f63STheodore Ts'o if (flags & I_DIRTY_INODE) 18310ae45f63STheodore Ts'o inode->i_state &= ~I_DIRTY_TIME; 183203ba3782SJens Axboe inode->i_state |= flags; 183303ba3782SJens Axboe 183403ba3782SJens Axboe /* 183503ba3782SJens Axboe * If the inode is being synced, just update its dirty state. 183603ba3782SJens Axboe * The unlocker will place the inode on the appropriate 183703ba3782SJens Axboe * superblock list, based upon its state. 183803ba3782SJens Axboe */ 183903ba3782SJens Axboe if (inode->i_state & I_SYNC) 1840250df6edSDave Chinner goto out_unlock_inode; 184103ba3782SJens Axboe 184203ba3782SJens Axboe /* 184303ba3782SJens Axboe * Only add valid (hashed) inodes to the superblock's 184403ba3782SJens Axboe * dirty list. Add blockdev inodes as well. 184503ba3782SJens Axboe */ 184603ba3782SJens Axboe if (!S_ISBLK(inode->i_mode)) { 18471d3382cbSAl Viro if (inode_unhashed(inode)) 1848250df6edSDave Chinner goto out_unlock_inode; 184903ba3782SJens Axboe } 1850a4ffdde6SAl Viro if (inode->i_state & I_FREEING) 1851250df6edSDave Chinner goto out_unlock_inode; 185203ba3782SJens Axboe 185303ba3782SJens Axboe /* 185403ba3782SJens Axboe * If the inode was already on b_dirty/b_io/b_more_io, don't 185503ba3782SJens Axboe * reposition it (that would break b_dirty time-ordering). 
185603ba3782SJens Axboe */ 185703ba3782SJens Axboe if (!was_dirty) { 185887e1d789STejun Heo struct bdi_writeback *wb; 1859d6c10f1fSTejun Heo struct list_head *dirty_list; 1860a66979abSDave Chinner bool wakeup_bdi = false; 1861500b067cSJens Axboe 186287e1d789STejun Heo wb = locked_inode_to_wb_and_lock_list(inode); 1863253c34e9SArtem Bityutskiy 18640747259dSTejun Heo WARN(bdi_cap_writeback_dirty(wb->bdi) && 18650747259dSTejun Heo !test_bit(WB_registered, &wb->state), 18660747259dSTejun Heo "bdi-%s not registered\n", wb->bdi->name); 186703ba3782SJens Axboe 186803ba3782SJens Axboe inode->dirtied_when = jiffies; 1869a2f48706STheodore Ts'o if (dirtytime) 1870a2f48706STheodore Ts'o inode->dirtied_time_when = jiffies; 1871d6c10f1fSTejun Heo 1872a2f48706STheodore Ts'o if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) 18730747259dSTejun Heo dirty_list = &wb->b_dirty; 1874a2f48706STheodore Ts'o else 18750747259dSTejun Heo dirty_list = &wb->b_dirty_time; 1876d6c10f1fSTejun Heo 18770747259dSTejun Heo wakeup_bdi = inode_wb_list_move_locked(inode, wb, 1878d6c10f1fSTejun Heo dirty_list); 1879d6c10f1fSTejun Heo 18800747259dSTejun Heo spin_unlock(&wb->list_lock); 18810ae45f63STheodore Ts'o trace_writeback_dirty_inode_enqueue(inode); 1882253c34e9SArtem Bityutskiy 1883d6c10f1fSTejun Heo /* 1884d6c10f1fSTejun Heo * If this is the first dirty inode for this bdi, 1885d6c10f1fSTejun Heo * we have to wake-up the corresponding bdi thread 1886d6c10f1fSTejun Heo * to make sure background write-back happens 1887d6c10f1fSTejun Heo * later. 1888d6c10f1fSTejun Heo */ 18890747259dSTejun Heo if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi) 18900747259dSTejun Heo wb_wakeup_delayed(wb); 1891a66979abSDave Chinner return; 1892a66979abSDave Chinner } 1893a66979abSDave Chinner } 1894a66979abSDave Chinner out_unlock_inode: 1895a66979abSDave Chinner spin_unlock(&inode->i_lock); 1896a66979abSDave Chinner 189703ba3782SJens Axboe } 189803ba3782SJens Axboe EXPORT_SYMBOL(__mark_inode_dirty); 189903ba3782SJens Axboe 1900b6e51316SJens Axboe static void wait_sb_inodes(struct super_block *sb) 190166f3b8e2SJens Axboe { 190238f21977SNick Piggin struct inode *inode, *old_inode = NULL; 190338f21977SNick Piggin 190403ba3782SJens Axboe /* 190503ba3782SJens Axboe * We need to be protected against the filesystem going from 190603ba3782SJens Axboe * r/o to r/w or vice versa. 190703ba3782SJens Axboe */ 1908b6e51316SJens Axboe WARN_ON(!rwsem_is_locked(&sb->s_umount)); 190903ba3782SJens Axboe 191055fa6091SDave Chinner spin_lock(&inode_sb_list_lock); 191166f3b8e2SJens Axboe 191238f21977SNick Piggin /* 191338f21977SNick Piggin * Data integrity sync. Must wait for all pages under writeback, 191438f21977SNick Piggin * because there may have been pages dirtied before our sync 191538f21977SNick Piggin * call, but which had writeout started before we write it out. 191638f21977SNick Piggin * In which case, the inode may not be on the dirty list, but 191738f21977SNick Piggin * we still have to wait for that writeout. 
191838f21977SNick Piggin */
1919b6e51316SJens Axboe list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1920250df6edSDave Chinner struct address_space *mapping = inode->i_mapping;
192138f21977SNick Piggin
1922250df6edSDave Chinner spin_lock(&inode->i_lock);
1923250df6edSDave Chinner if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
1924250df6edSDave Chinner (mapping->nrpages == 0)) {
1925250df6edSDave Chinner spin_unlock(&inode->i_lock);
192638f21977SNick Piggin continue;
1927250df6edSDave Chinner }
192838f21977SNick Piggin __iget(inode);
1929250df6edSDave Chinner spin_unlock(&inode->i_lock);
193055fa6091SDave Chinner spin_unlock(&inode_sb_list_lock);
193155fa6091SDave Chinner
193238f21977SNick Piggin /*
193355fa6091SDave Chinner * We hold a reference to 'inode' so it couldn't have been
193455fa6091SDave Chinner * removed from s_inodes list while we dropped the
193555fa6091SDave Chinner * inode_sb_list_lock. We cannot iput the inode now as we can
193655fa6091SDave Chinner * be holding the last reference and we cannot iput it under
193755fa6091SDave Chinner * inode_sb_list_lock. So we keep the reference and iput it
193855fa6091SDave Chinner * later.
193938f21977SNick Piggin */
194038f21977SNick Piggin iput(old_inode);
194138f21977SNick Piggin old_inode = inode;
194238f21977SNick Piggin
194338f21977SNick Piggin filemap_fdatawait(mapping);
194438f21977SNick Piggin
194538f21977SNick Piggin cond_resched();
194638f21977SNick Piggin
194755fa6091SDave Chinner spin_lock(&inode_sb_list_lock);
194838f21977SNick Piggin }
194955fa6091SDave Chinner spin_unlock(&inode_sb_list_lock);
195038f21977SNick Piggin iput(old_inode);
195166f3b8e2SJens Axboe }
19521da177e4SLinus Torvalds
1953f30a7d0cSTejun Heo static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
1954f30a7d0cSTejun Heo enum wb_reason reason, bool skip_if_busy)
19551da177e4SLinus Torvalds {
1956cc395d7fSTejun Heo DEFINE_WB_COMPLETION_ONSTACK(done);
195783ba7b07SChristoph Hellwig struct wb_writeback_work work = {
19583c4d7165SChristoph Hellwig .sb = sb,
19593c4d7165SChristoph Hellwig .sync_mode = WB_SYNC_NONE,
19606e6938b6SWu Fengguang .tagged_writepages = 1,
196183ba7b07SChristoph Hellwig .done = &done,
19623259f8beSChris Mason .nr_pages = nr,
19630e175a18SCurt Wohlgemuth .reason = reason,
19643c4d7165SChristoph Hellwig };
1965e7972912STejun Heo struct backing_dev_info *bdi = sb->s_bdi;
19660e3c9a22SJens Axboe
1967e7972912STejun Heo if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
19686eedc701SJan Kara return;
1969cf37e972SChristoph Hellwig WARN_ON(!rwsem_is_locked(&sb->s_umount));
1970f30a7d0cSTejun Heo
1971db125360STejun Heo bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
1972cc395d7fSTejun Heo wb_wait_for_completion(bdi, &done);
19731da177e4SLinus Torvalds }
1974f30a7d0cSTejun Heo
1975f30a7d0cSTejun Heo /**
1976f30a7d0cSTejun Heo * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
1977f30a7d0cSTejun Heo * @sb: the superblock
1978f30a7d0cSTejun Heo * @nr: the number of pages to write
1979f30a7d0cSTejun Heo * @reason: reason why some writeback work was initiated
1980f30a7d0cSTejun Heo *
1981f30a7d0cSTejun Heo * Start writeback on some inodes on this super_block. No guarantees are made
1982f30a7d0cSTejun Heo * on how many (if any) will be written, and this function does not wait
1983f30a7d0cSTejun Heo * for IO completion of submitted IO.
1984f30a7d0cSTejun Heo */
1985f30a7d0cSTejun Heo void writeback_inodes_sb_nr(struct super_block *sb,
1986f30a7d0cSTejun Heo unsigned long nr,
1987f30a7d0cSTejun Heo enum wb_reason reason)
1988f30a7d0cSTejun Heo {
1989f30a7d0cSTejun Heo __writeback_inodes_sb_nr(sb, nr, reason, false);
1990f30a7d0cSTejun Heo }
19913259f8beSChris Mason EXPORT_SYMBOL(writeback_inodes_sb_nr);
19923259f8beSChris Mason
19933259f8beSChris Mason /**
19943259f8beSChris Mason * writeback_inodes_sb - writeback dirty inodes from given super_block
19953259f8beSChris Mason * @sb: the superblock
1996786228abSMarcos Paulo de Souza * @reason: reason why some writeback work was initiated
19973259f8beSChris Mason *
19983259f8beSChris Mason * Start writeback on some inodes on this super_block. No guarantees are made
19993259f8beSChris Mason * on how many (if any) will be written, and this function does not wait
20003259f8beSChris Mason * for IO completion of submitted IO.
20013259f8beSChris Mason */
20020e175a18SCurt Wohlgemuth void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
20033259f8beSChris Mason {
20040e175a18SCurt Wohlgemuth return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
20053259f8beSChris Mason }
2006d8a8559cSJens Axboe EXPORT_SYMBOL(writeback_inodes_sb);
2007d8a8559cSJens Axboe
2008d8a8559cSJens Axboe /**
200910ee27a0SMiao Xie * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
20103259f8beSChris Mason * @sb: the superblock
20113259f8beSChris Mason * @nr: the number of pages to write
201210ee27a0SMiao Xie * @reason: the reason for writeback
20133259f8beSChris Mason *
201410ee27a0SMiao Xie * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
20153259f8beSChris Mason * Returns true if writeback was started, false if not.
20163259f8beSChris Mason */
2017f30a7d0cSTejun Heo bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
20180e175a18SCurt Wohlgemuth enum wb_reason reason)
20193259f8beSChris Mason {
202010ee27a0SMiao Xie if (!down_read_trylock(&sb->s_umount))
2021f30a7d0cSTejun Heo return false;
202210ee27a0SMiao Xie
2023f30a7d0cSTejun Heo __writeback_inodes_sb_nr(sb, nr, reason, true);
20243259f8beSChris Mason up_read(&sb->s_umount);
2025f30a7d0cSTejun Heo return true;
20263259f8beSChris Mason }
202710ee27a0SMiao Xie EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
202810ee27a0SMiao Xie
202910ee27a0SMiao Xie /**
203010ee27a0SMiao Xie * try_to_writeback_inodes_sb - try to start writeback if none underway
203110ee27a0SMiao Xie * @sb: the superblock
203210ee27a0SMiao Xie * @reason: reason why some writeback work was initiated
203310ee27a0SMiao Xie *
203410ee27a0SMiao Xie * Implemented via try_to_writeback_inodes_sb_nr().
203510ee27a0SMiao Xie * Returns true if writeback was started, false if not.
203610ee27a0SMiao Xie */
2037f30a7d0cSTejun Heo bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
203810ee27a0SMiao Xie {
203910ee27a0SMiao Xie return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
204010ee27a0SMiao Xie }
204110ee27a0SMiao Xie EXPORT_SYMBOL(try_to_writeback_inodes_sb);
20423259f8beSChris Mason
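/*
 * Illustrative sketch (not from the original file): the try_to_* variants
 * back off instead of blocking on s_umount, so filesystems that are short
 * on space can use them opportunistically. A hypothetical caller:
 */
#if 0	/* example only, not compiled */
static void example_reclaim_space(struct super_block *sb)
{
	/* silently does nothing if s_umount is contended */
	try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
}
#endif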
20433259f8beSChris Mason /**
2044d8a8559cSJens Axboe * sync_inodes_sb - sync sb inode pages
2045d8a8559cSJens Axboe * @sb: the superblock
2046d8a8559cSJens Axboe *
2047d8a8559cSJens Axboe * This function writes and waits on any dirty inode belonging to this
20480dc83bd3SJan Kara * super_block.
2049d8a8559cSJens Axboe */
20500dc83bd3SJan Kara void sync_inodes_sb(struct super_block *sb)
2051d8a8559cSJens Axboe {
2052cc395d7fSTejun Heo DEFINE_WB_COMPLETION_ONSTACK(done);
205383ba7b07SChristoph Hellwig struct wb_writeback_work work = {
20543c4d7165SChristoph Hellwig .sb = sb,
20553c4d7165SChristoph Hellwig .sync_mode = WB_SYNC_ALL,
20563c4d7165SChristoph Hellwig .nr_pages = LONG_MAX,
20573c4d7165SChristoph Hellwig .range_cyclic = 0,
205883ba7b07SChristoph Hellwig .done = &done,
20590e175a18SCurt Wohlgemuth .reason = WB_REASON_SYNC,
20607747bd4bSDave Chinner .for_sync = 1,
20613c4d7165SChristoph Hellwig };
2062e7972912STejun Heo struct backing_dev_info *bdi = sb->s_bdi;
20633c4d7165SChristoph Hellwig
20646eedc701SJan Kara /* Nothing to do? */
2065e7972912STejun Heo if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
20666eedc701SJan Kara return;
2067cf37e972SChristoph Hellwig WARN_ON(!rwsem_is_locked(&sb->s_umount));
2068cf37e972SChristoph Hellwig
2069db125360STejun Heo bdi_split_work_to_wbs(bdi, &work, false);
2070cc395d7fSTejun Heo wb_wait_for_completion(bdi, &done);
207183ba7b07SChristoph Hellwig
2072b6e51316SJens Axboe wait_sb_inodes(sb);
2073d8a8559cSJens Axboe }
2074d8a8559cSJens Axboe EXPORT_SYMBOL(sync_inodes_sb);
20751da177e4SLinus Torvalds
20761da177e4SLinus Torvalds /**
20771da177e4SLinus Torvalds * write_inode_now - write an inode to disk
20781da177e4SLinus Torvalds * @inode: inode to write to disk
20791da177e4SLinus Torvalds * @sync: whether the write should be synchronous or not
20801da177e4SLinus Torvalds *
20817f04c26dSAndrea Arcangeli * This function commits an inode to disk immediately if it is dirty. This is
20827f04c26dSAndrea Arcangeli * primarily needed by knfsd.
20837f04c26dSAndrea Arcangeli *
20847f04c26dSAndrea Arcangeli * The caller must either have a ref on the inode or must have set I_WILL_FREE.
20851da177e4SLinus Torvalds */
20861da177e4SLinus Torvalds int write_inode_now(struct inode *inode, int sync)
20871da177e4SLinus Torvalds {
2088f758eeabSChristoph Hellwig struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
20891da177e4SLinus Torvalds struct writeback_control wbc = {
20901da177e4SLinus Torvalds .nr_to_write = LONG_MAX,
209118914b18SMike Galbraith .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2092111ebb6eSOGAWA Hirofumi .range_start = 0,
2093111ebb6eSOGAWA Hirofumi .range_end = LLONG_MAX,
20941da177e4SLinus Torvalds };
20951da177e4SLinus Torvalds
20961da177e4SLinus Torvalds if (!mapping_cap_writeback_dirty(inode->i_mapping))
209749364ce2SAndrew Morton wbc.nr_to_write = 0;
20981da177e4SLinus Torvalds
20991da177e4SLinus Torvalds might_sleep();
21004f8ad655SJan Kara return writeback_single_inode(inode, wb, &wbc);
21011da177e4SLinus Torvalds }
21021da177e4SLinus Torvalds EXPORT_SYMBOL(write_inode_now);
21031da177e4SLinus Torvalds
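/*
 * Illustrative sketch (not from the original file): the knfsd-style use
 * mentioned above amounts to forcing one dirty inode out synchronously
 * before replying to a client. A hypothetical caller:
 */
#if 0	/* example only, not compiled */
static int example_commit_inode(struct inode *inode)
{
	/* sync == 1 selects WB_SYNC_ALL: data is waited on, then metadata */
	return write_inode_now(inode, 1);
}
#endif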
21041da177e4SLinus Torvalds /**
21051da177e4SLinus Torvalds * sync_inode - write an inode and its pages to disk.
21061da177e4SLinus Torvalds * @inode: the inode to sync
21071da177e4SLinus Torvalds * @wbc: controls the writeback mode
21081da177e4SLinus Torvalds *
21091da177e4SLinus Torvalds * sync_inode() will write an inode and its pages to disk. It will also
21101da177e4SLinus Torvalds * correctly update the inode on its superblock's dirty inode lists and will
21111da177e4SLinus Torvalds * update inode->i_state.
21121da177e4SLinus Torvalds *
21131da177e4SLinus Torvalds * The caller must have a ref on the inode.
21141da177e4SLinus Torvalds */
21151da177e4SLinus Torvalds int sync_inode(struct inode *inode, struct writeback_control *wbc)
21161da177e4SLinus Torvalds {
21174f8ad655SJan Kara return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
21181da177e4SLinus Torvalds }
21191da177e4SLinus Torvalds EXPORT_SYMBOL(sync_inode);
2120c3765016SChristoph Hellwig
2121c3765016SChristoph Hellwig /**
2122c691b9d9SAndrew Morton * sync_inode_metadata - write an inode to disk
2123c3765016SChristoph Hellwig * @inode: the inode to sync
2124c3765016SChristoph Hellwig * @wait: wait for I/O to complete.
2125c3765016SChristoph Hellwig *
2126c691b9d9SAndrew Morton * Write an inode to disk and adjust its dirty state after completion.
2127c3765016SChristoph Hellwig *
2128c3765016SChristoph Hellwig * Note: only writes the actual inode, no associated data or other metadata.
2129c3765016SChristoph Hellwig */
2130c3765016SChristoph Hellwig int sync_inode_metadata(struct inode *inode, int wait)
2131c3765016SChristoph Hellwig {
2132c3765016SChristoph Hellwig struct writeback_control wbc = {
2133c3765016SChristoph Hellwig .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2134c3765016SChristoph Hellwig .nr_to_write = 0, /* metadata-only */
2135c3765016SChristoph Hellwig };
2136c3765016SChristoph Hellwig
2137c3765016SChristoph Hellwig return sync_inode(inode, &wbc);
2138c3765016SChristoph Hellwig }
2139c3765016SChristoph Hellwig EXPORT_SYMBOL(sync_inode_metadata);
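/*
 * Illustrative sketch (not from the original file): a generic_file_fsync()
 * style path can use sync_inode_metadata() when only the inode itself needs
 * writing, since nr_to_write == 0 skips data pages entirely. Hypothetical:
 */
#if 0	/* example only, not compiled */
static int example_fsync_metadata(struct inode *inode)
{
	/* wait == 1 makes this a synchronous WB_SYNC_ALL write */
	return sync_inode_metadata(inode, 1);
}
#endif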