// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))
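
/*
 * Worked example (illustrative): 4096UL is the chunk size in KiB and
 * (PAGE_SHIFT - 10) converts KiB to pages.  With 4KiB pages
 * (PAGE_SHIFT == 12) this is 4096 >> 2 == 1024 pages, i.e. 4MB; with
 * 64KiB pages (PAGE_SHIFT == 16) it is 4096 >> 6 == 64 pages.
 */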

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;
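
/*
 * A rough illustration of the two-interval worst case described above
 * (assuming the 12h default): an inode's timestamp is dirtied just after
 * a dirtytime writeback pass, so it only becomes "expired" ~12h later,
 * and the pass that finally picks it up may itself run up to another
 * interval after that -- hence up to ~24h before the timestamp update
 * reaches disk.
 */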

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_io_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}

static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);
	WARN_ON_ONCE(inode->i_state & I_FREEING);

	list_move(&inode->i_io_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}

static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_irq(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_irq(&wb->work_lock);
}

static void finish_writeback_work(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	struct wb_completion *done = work->done;

	if (work->auto_free)
		kfree(work);
	if (done) {
		wait_queue_head_t *waitq = done->waitq;

		/* @done can't be accessed after the following dec */
		if (atomic_dec_and_test(&done->cnt))
			wake_up_all(waitq);
	}
}

static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb, work);

	if (work->done)
		atomic_inc(&work->done->cnt);

	spin_lock_irq(&wb->work_lock);

	if (test_bit(WB_registered, &wb->state)) {
		list_add_tail(&work->list, &wb->work_list);
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	} else
		finish_writeback_work(wb, work);

	spin_unlock_irq(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been initialized with
 * DEFINE_WB_COMPLETION().  This function returns after all such work items
 * are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
void wb_wait_for_completion(struct wb_completion *done)
{
	atomic_dec(&done->cnt);		/* put down the initial count */
	wait_event(*done->waitq, !atomic_read(&done->cnt));
}
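
/*
 * Illustrative usage sketch mirroring the on-stack fallback path in
 * bdi_split_work_to_wbs() further below (variable names here are just
 * examples): declare a completion, point queued work items at it, then
 * wait for them all to finish.
 *
 *	DEFINE_WB_COMPLETION(done, bdi);
 *	struct wb_writeback_work work = {
 *		.done		= &done,
 *		.auto_free	= 0,	// on-stack, caller owns it
 *	};
 *
 *	wb_queue_work(wb, &work);
 *	wb_wait_for_completion(&done);
 */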

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for foreign inode detection, see wbc_detach_inode() to see
 * how they're used.
 *
 * These parameters are inherently heuristic as the detection target
 * itself is fuzzy.  All we want to do is detach an inode from its
 * current owner if it's being written to too much by some other cgroup.
 *
 * The current cgroup writeback is built on the assumption that multiple
 * cgroups writing to the same inode concurrently is very rare and a mode
 * of operation which isn't well supported.  As such, the goal is not
 * taking too long when a different cgroup takes over an inode while
 * avoiding too aggressive flip-flops from occasional foreign writes.
 *
 * We record, very roughly, 2s worth of IO time history and if more than
 * half of that is foreign, trigger the switch.  The recording is quantized
 * to 16 slots.  To avoid tiny writes from swinging the decision too much,
 * writes smaller than 1/8 of avg size are ignored.
 */
#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, up to 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	8	/* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */

#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect up to 5 slots */
#define WB_FRN_MAX_IN_FLIGHT	1024	/* don't queue too many concurrently */
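
/*
 * Worked example of the knobs above (illustrative): IO time is kept in
 * units of 1/8192s (WB_FRN_TIME_SHIFT == 13) and the history covers
 * WB_FRN_TIME_PERIOD == 2s split into 16 slots of 125ms each
 * (WB_FRN_HIST_UNIT).  A foreign round that consumed ~300ms of IO time
 * therefore shifts DIV_ROUND_UP(300ms, 125ms) == 3 one-bits into
 * inode->i_wb_frn_history, capped at WB_FRN_HIST_MAX_SLOTS == 5.
 */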

/*
 * Maximum inodes per isw.  A specific value has been chosen to make
 * struct inode_switch_wbs_context fit into a 1024-byte kmalloc allocation.
 */
#define WB_MAX_INODES_PER_ISW  ((1024UL - sizeof(struct inode_switch_wbs_context)) \
                                / sizeof(struct inode *))
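
/*
 * For illustration only (actual sizes are arch/config dependent): with
 * 8-byte pointers and, say, a 64-byte inode_switch_wbs_context header,
 * the formula above yields (1024 - 64) / 8 == 120 slots; the final slot
 * is kept NULL by isw_prepare_wbs_switch() to terminate the array.
 */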

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;

void __inode_attach_wb(struct inode *inode, struct folio *folio)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;

	if (inode_cgwb_enabled(inode)) {
		struct cgroup_subsys_state *memcg_css;

		if (folio) {
			memcg_css = mem_cgroup_css_from_folio(folio);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
		} else {
			/* must pin memcg_css, see wb_get_create() */
			memcg_css = task_get_css(current, memory_cgrp_id);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
			css_put(memcg_css);
		}
	}

	if (!wb)
		wb = &bdi->wb;

	/*
	 * There may be multiple instances of this function racing to
	 * update the same inode.  Use cmpxchg() to tell the winner.
	 */
	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
		wb_put(wb);
}
EXPORT_SYMBOL_GPL(__inode_attach_wb);

/**
 * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
 * @inode: inode of interest with i_lock held
 * @wb: target bdi_writeback
 *
 * Remove the inode from wb's io lists and if necessary put it onto the
 * b_attached list.  Only inodes attached to cgwb's are kept on this list.
 */
static void inode_cgwb_move_to_attached(struct inode *inode,
					struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);
	WARN_ON_ONCE(inode->i_state & I_FREEING);

	inode->i_state &= ~I_SYNC_QUEUED;
	if (wb != &wb->bdi->wb)
		list_move(&inode->i_io_list, &wb->b_attached);
	else
		list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);
}

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
	__acquires(&wb->list_lock)
{
	while (true) {
		struct bdi_writeback *wb = inode_to_wb(inode);

		/*
		 * inode_to_wb() association is protected by both
		 * @inode->i_lock and @wb->list_lock but list_lock nests
		 * outside i_lock.  Drop i_lock and verify that the
		 * association hasn't changed after acquiring list_lock.
		 */
		wb_get(wb);
		spin_unlock(&inode->i_lock);
		spin_lock(&wb->list_lock);

		/* i_wb may have changed in between, can't use inode_to_wb() */
		if (likely(wb == inode->i_wb)) {
			wb_put(wb);	/* @inode already has ref */
			return wb;
		}

		spin_unlock(&wb->list_lock);
		wb_put(wb);
		cpu_relax();
		spin_lock(&inode->i_lock);
	}
}

/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	spin_lock(&inode->i_lock);
	return locked_inode_to_wb_and_lock_list(inode);
}

struct inode_switch_wbs_context {
	struct rcu_work		work;

	/*
	 * Multiple inodes can be switched at once.  The switching procedure
	 * consists of two parts, separated by an RCU grace period.  To make
	 * sure that the second part is executed for each inode that has gone
	 * through the first part, all inode pointers are placed into a
	 * NULL-terminated array embedded into struct inode_switch_wbs_context.
	 * Otherwise an inode could be left in an inconsistent state.
	 */
	struct bdi_writeback	*new_wb;
	struct inode		*inodes[];
};

static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	down_write(&bdi->wb_switch_rwsem);
}

static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	up_write(&bdi->wb_switch_rwsem);
}

static bool inode_do_switch_wbs(struct inode *inode,
				struct bdi_writeback *old_wb,
				struct bdi_writeback *new_wb)
{
	struct address_space *mapping = inode->i_mapping;
	XA_STATE(xas, &mapping->i_pages, 0);
	struct folio *folio;
	bool switched = false;

	spin_lock(&inode->i_lock);
	xa_lock_irq(&mapping->i_pages);

	/*
	 * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
	 * path owns the inode and we shouldn't modify ->i_io_list.
	 */
	if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
		goto skip_switch;

	trace_inode_switch_wbs(inode, old_wb, new_wb);

	/*
	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
	 * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to
	 * folios actually under writeback.
	 */
	xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
		if (folio_test_dirty(folio)) {
			long nr = folio_nr_pages(folio);
			wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr);
			wb_stat_mod(new_wb, WB_RECLAIMABLE, nr);
		}
	}

	xas_set(&xas, 0);
	xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
		long nr = folio_nr_pages(folio);
		WARN_ON_ONCE(!folio_test_writeback(folio));
		wb_stat_mod(old_wb, WB_WRITEBACK, -nr);
		wb_stat_mod(new_wb, WB_WRITEBACK, nr);
	}

	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
		atomic_dec(&old_wb->writeback_inodes);
		atomic_inc(&new_wb->writeback_inodes);
	}

	wb_get(new_wb);

	/*
	 * Transfer to @new_wb's IO list if necessary.  If the @inode is dirty,
	 * the specific list @inode was on is ignored and the @inode is put on
	 * ->b_dirty which is always correct including from ->b_dirty_time.
	 * The transfer preserves @inode->dirtied_when ordering.  If the @inode
	 * was clean, it means it was on the b_attached list, so move it onto
	 * the b_attached list of @new_wb.
	 */
	if (!list_empty(&inode->i_io_list)) {
		inode->i_wb = new_wb;

		if (inode->i_state & I_DIRTY_ALL) {
			struct inode *pos;

			list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
				if (time_after_eq(inode->dirtied_when,
						  pos->dirtied_when))
					break;
			inode_io_list_move_locked(inode, new_wb,
						  pos->i_io_list.prev);
		} else {
			inode_cgwb_move_to_attached(inode, new_wb);
		}
	} else {
		inode->i_wb = new_wb;
	}

	/* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
	inode->i_wb_frn_winner = 0;
	inode->i_wb_frn_avg_time = 0;
	inode->i_wb_frn_history = 0;
	switched = true;
skip_switch:
	/*
	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
	 */
	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);

	xa_unlock_irq(&mapping->i_pages);
	spin_unlock(&inode->i_lock);

	return switched;
}

static void inode_switch_wbs_work_fn(struct work_struct *work)
{
	struct inode_switch_wbs_context *isw =
		container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
	struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
	struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
	struct bdi_writeback *new_wb = isw->new_wb;
	unsigned long nr_switched = 0;
	struct inode **inodep;

	/*
	 * If @inode switches cgwb membership while sync_inodes_sb() is
	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
	 */
	down_read(&bdi->wb_switch_rwsem);

	/*
	 * By the time control reaches here, RCU grace period has passed
	 * since I_WB_SWITCH assertion and all wb stat update transactions
	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
	 * synchronizing against the i_pages lock.
	 *
	 * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
	 * gives us exclusion against all wb related operations on @inode
	 * including IO list manipulations and stat updates.
	 */
	if (old_wb < new_wb) {
		spin_lock(&old_wb->list_lock);
		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&new_wb->list_lock);
		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
	}

	for (inodep = isw->inodes; *inodep; inodep++) {
		WARN_ON_ONCE((*inodep)->i_wb != old_wb);
		if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
			nr_switched++;
	}

	spin_unlock(&new_wb->list_lock);
	spin_unlock(&old_wb->list_lock);

	up_read(&bdi->wb_switch_rwsem);

	if (nr_switched) {
		wb_wakeup(new_wb);
		wb_put_many(old_wb, nr_switched);
	}

	for (inodep = isw->inodes; *inodep; inodep++)
		iput(*inodep);
	wb_put(new_wb);
	kfree(isw);
	atomic_dec(&isw_nr_in_flight);
}

static bool inode_prepare_wbs_switch(struct inode *inode,
				     struct bdi_writeback *new_wb)
{
	/*
	 * Paired with smp_mb() in cgroup_writeback_umount().
	 * isw_nr_in_flight must be increased before checking SB_ACTIVE and
	 * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
	 * in cgroup_writeback_umount() and the isw_wq will not be flushed.
	 */
	smp_mb();

	if (IS_DAX(inode))
		return false;

	/* while holding I_WB_SWITCH, no one else can update the association */
	spin_lock(&inode->i_lock);
	if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
	    inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
	    inode_to_wb(inode) == new_wb) {
		spin_unlock(&inode->i_lock);
		return false;
	}
	inode->i_state |= I_WB_SWITCH;
	__iget(inode);
	spin_unlock(&inode->i_lock);

	return true;
}

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;

	/* noop if a switch already seems to be in progress */
	if (inode->i_state & I_WB_SWITCH)
		return;

	/* avoid queueing a new switch if too many are already in flight */
	if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
		return;

	isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC);
	if (!isw)
		return;

	atomic_inc(&isw_nr_in_flight);

	/* find and pin the new wb */
	rcu_read_lock();
	memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
	if (memcg_css && !css_tryget(memcg_css))
		memcg_css = NULL;
	rcu_read_unlock();
	if (!memcg_css)
		goto out_free;

	isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
	css_put(memcg_css);
	if (!isw->new_wb)
		goto out_free;

	if (!inode_prepare_wbs_switch(inode, isw->new_wb))
		goto out_free;

	isw->inodes[0] = inode;

	/*
	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
	 * the RCU protected stat update paths to grab the i_pages
	 * lock so that stat transfer can synchronize against them.
	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
	 */
	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_rcu_work(isw_wq, &isw->work);
	return;

out_free:
	atomic_dec(&isw_nr_in_flight);
	if (isw->new_wb)
		wb_put(isw->new_wb);
	kfree(isw);
}

static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
				   struct list_head *list, int *nr)
{
	struct inode *inode;

	list_for_each_entry(inode, list, i_io_list) {
		if (!inode_prepare_wbs_switch(inode, isw->new_wb))
			continue;

		isw->inodes[*nr] = inode;
		(*nr)++;

		if (*nr >= WB_MAX_INODES_PER_ISW - 1)
			return true;
	}
	return false;
}

/**
 * cleanup_offline_cgwb - detach associated inodes
 * @wb: target wb
 *
 * Switch all inodes attached to @wb to the nearest living ancestor's wb in
 * order to eventually release the dying @wb.  Returns %true if not all inodes
 * were switched and the function has to be restarted.
 */
bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;
	int nr;
	bool restart = false;

	isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW),
		      GFP_KERNEL);
	if (!isw)
		return restart;

	atomic_inc(&isw_nr_in_flight);

	for (memcg_css = wb->memcg_css->parent; memcg_css;
	     memcg_css = memcg_css->parent) {
		isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
		if (isw->new_wb)
			break;
	}
	if (unlikely(!isw->new_wb))
		isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */

	nr = 0;
	spin_lock(&wb->list_lock);
	/*
	 * In addition to the inodes that have completed writeback, also switch
	 * cgwbs for those inodes only with dirty timestamps.  Otherwise, those
	 * inodes won't be written back for a long time when lazytime is
	 * enabled, thus pinning the dying cgwbs.  It won't break the
	 * bandwidth restrictions, as writeback of inode metadata is not
	 * accounted for.
	 */
	restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
	if (!restart)
		restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
	spin_unlock(&wb->list_lock);

	/* no attached inodes? bail out */
	if (nr == 0) {
		atomic_dec(&isw_nr_in_flight);
		wb_put(isw->new_wb);
		kfree(isw);
		return restart;
	}

	/*
	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
	 * the RCU protected stat update paths to grab the i_pages
	 * lock so that stat transfer can synchronize against them.
	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
	 */
	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_rcu_work(isw_wq, &isw->work);

	return restart;
}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
				 struct inode *inode)
{
	if (!inode_cgwb_enabled(inode)) {
		spin_unlock(&inode->i_lock);
		return;
	}

	wbc->wb = inode_to_wb(inode);
	wbc->inode = inode;

	wbc->wb_id = wbc->wb->memcg_css->id;
	wbc->wb_lcand_id = inode->i_wb_frn_winner;
	wbc->wb_tcand_id = 0;
	wbc->wb_bytes = 0;
	wbc->wb_lcand_bytes = 0;
	wbc->wb_tcand_bytes = 0;

	wb_get(wbc->wb);
	spin_unlock(&inode->i_lock);

	/*
	 * A dying wb indicates that either the blkcg associated with the
	 * memcg changed or the associated memcg is dying.  In the first
	 * case, a replacement wb should already be available and we should
	 * refresh the wb immediately.  In the second case, trying to
	 * refresh will keep failing.
	 */
	if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
		inode_switch_wbs(inode, wbc->wb_id);
}
EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on a first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (a single foreign page can lead to gigabytes of writeback being
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using the Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb (max of last round's current
 * wb, the winner from two rounds ago, and the last round's majority
 * candidate).  Keeping track of the historical winner helps the algorithm
 * to semi-reliably detect the most active writer even when it's not the
 * absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * over a certain threshold, the switch verdict is given.
 */
void wbc_detach_inode(struct writeback_control *wbc)
{
	struct bdi_writeback *wb = wbc->wb;
	struct inode *inode = wbc->inode;
	unsigned long avg_time, max_bytes, max_time;
	u16 history;
	int max_id;

	if (!wb)
		return;

	history = inode->i_wb_frn_history;
	avg_time = inode->i_wb_frn_avg_time;

	/* pick the winner of this round */
	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_id;
		max_bytes = wbc->wb_bytes;
	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_lcand_id;
		max_bytes = wbc->wb_lcand_bytes;
	} else {
		max_id = wbc->wb_tcand_id;
		max_bytes = wbc->wb_tcand_bytes;
	}

	/*
	 * Calculate the amount of IO time the winner consumed and fold it
	 * into the running average kept per inode.  If the consumed IO
	 * time is lower than avg_time / WB_FRN_TIME_CUT_DIV, ignore it for
	 * deciding whether to switch or not.  This is to prevent one-off
	 * small dirtiers from skewing the verdict.
	 */
	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
				wb->avg_write_bandwidth);
	if (avg_time)
		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
	else
		avg_time = max_time;	/* immediate catch up on first run */

	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
		int slots;

		/*
		 * The switch verdict is reached if foreign wb's consume
		 * more than a certain proportion of IO time in a
		 * WB_FRN_TIME_PERIOD.  This is loosely tracked by a 16 slot
		 * history mask where each bit represents one sixteenth of
		 * the period.  Determine the number of slots to shift into
		 * history from @max_time.
		 */
		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
		history <<= slots;
		if (wbc->wb_id != max_id)
			history |= (1U << slots) - 1;

		if (history)
			trace_inode_foreign_history(inode, wbc, history);

		/*
		 * Switch if the current wb isn't the consistent winner.
		 * If there are multiple closely competing dirtiers, the
		 * inode may switch across them repeatedly over time, which
		 * is okay.  The main goal is avoiding keeping an inode on
		 * the wrong wb for an extended period of time.
		 */
		if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
			inode_switch_wbs(inode, max_id);
	}

	/*
	 * Multiple instances of this function may race to update the
	 * following fields but we don't mind occasional inaccuracies.
	 */
	inode->i_wb_frn_winner = max_id;
	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
	inode->i_wb_frn_history = history;

	wb_put(wbc->wb);
	wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);
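
/*
 * Worked example of the history bookkeeping above (illustrative): if a
 * round's winner was a foreign wb and its max_time spanned ~3 slots, the
 * history is shifted left by 3 and the 3 new low bits are set.  A run of
 * mostly-foreign rounds therefore accumulates set bits quickly; once more
 * than WB_FRN_HIST_THR_SLOTS of the 16 bits are set, inode_switch_wbs()
 * is invoked with the foreign winner's id.
 */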

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book for foreign inode detection.  See
 * wbc_detach_inode().
 */
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
			      size_t bytes)
{
	struct folio *folio;
	struct cgroup_subsys_state *css;
	int id;

	/*
	 * pageout() path doesn't attach @wbc to the inode being written
	 * out.  This is intentional as we don't want the function to block
	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
	 * regular writeback instead of writing things out itself.
	 */
	if (!wbc->wb || wbc->no_cgroup_owner)
		return;

	folio = page_folio(page);
	css = mem_cgroup_css_from_folio(folio);
	/* dead cgroups shouldn't contribute to inode ownership arbitration */
	if (!(css->flags & CSS_ONLINE))
		return;

	id = css->id;

	if (id == wbc->wb_id) {
		wbc->wb_bytes += bytes;
		return;
	}

	if (id == wbc->wb_lcand_id)
		wbc->wb_lcand_bytes += bytes;

	/* Boyer-Moore majority vote algorithm */
	if (!wbc->wb_tcand_bytes)
		wbc->wb_tcand_id = id;
	if (id == wbc->wb_tcand_id)
		wbc->wb_tcand_bytes += bytes;
	else
		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
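
/*
 * Illustrative trace of the Boyer-Moore candidate handling above (ids A
 * and B are hypothetical and match neither wbc->wb_id nor wb_lcand_id):
 * writes of (A, 4K), (B, 8K), (A, 4K) leave wb_tcand_id == A with
 * wb_tcand_bytes == 4K -- the second write cancels the first candidate
 * down to zero, and the third reinstates A.  The surviving candidate is
 * this round's best guess at the majority foreign writer and is weighed
 * against the other byte counts in wbc_detach_inode().
 */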

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
	unsigned long this_bw = wb->avg_write_bandwidth;
	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);

	if (nr_pages == LONG_MAX)
		return LONG_MAX;

	/*
	 * This may be called on clean wb's and proportional distribution
	 * may not make sense, just use the original @nr_pages in those
	 * cases.  In general, we wanna err on the side of writing more.
	 */
	if (!tot_bw || this_bw >= tot_bw)
		return nr_pages;
	else
		return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
}
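
/*
 * For example (illustrative numbers): with nr_pages == 1024, a wb whose
 * avg_write_bandwidth is 30 units out of a bdi total of 100 units is
 * handed DIV_ROUND_UP(1024 * 30, 100) == 308 pages of the work.
 */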

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
				  struct wb_writeback_work *base_work,
				  bool skip_if_busy)
{
	struct bdi_writeback *last_wb = NULL;
	struct bdi_writeback *wb = list_entry(&bdi->wb_list,
					      struct bdi_writeback, bdi_node);

	might_sleep();
restart:
	rcu_read_lock();
	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
		DEFINE_WB_COMPLETION(fallback_work_done, bdi);
		struct wb_writeback_work fallback_work;
		struct wb_writeback_work *work;
		long nr_pages;

		if (last_wb) {
			wb_put(last_wb);
			last_wb = NULL;
		}

		/* SYNC_ALL writes out I_DIRTY_TIME too */
		if (!wb_has_dirty_io(wb) &&
		    (base_work->sync_mode == WB_SYNC_NONE ||
		     list_empty(&wb->b_dirty_time)))
			continue;
		if (skip_if_busy && writeback_in_progress(wb))
			continue;

		nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);

		work = kmalloc(sizeof(*work), GFP_ATOMIC);
		if (work) {
			*work = *base_work;
			work->nr_pages = nr_pages;
			work->auto_free = 1;
			wb_queue_work(wb, work);
			continue;
		}

		/*
		 * If wb_tryget fails, the wb has been shutdown, skip it.
		 *
		 * Pin @wb so that it stays on @bdi->wb_list.  This allows
		 * continuing iteration from @wb after dropping and
		 * regrabbing rcu read lock.
		 */
		if (!wb_tryget(wb))
			continue;

		/* alloc failed, execute synchronously using on-stack fallback */
		work = &fallback_work;
		*work = *base_work;
		work->nr_pages = nr_pages;
		work->auto_free = 0;
		work->done = &fallback_work_done;

		wb_queue_work(wb, work);
		last_wb = wb;

		rcu_read_unlock();
		wb_wait_for_completion(&fallback_work_done);
		goto restart;
	}
	rcu_read_unlock();

	if (last_wb)
		wb_put(last_wb);
}

1028a1a0e23eSTejun Heo /**
1029d62241c7STejun Heo * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
1030d62241c7STejun Heo * @bdi_id: target bdi id
1031d62241c7STejun Heo * @memcg_id: target memcg css id
1032d62241c7STejun Heo * @reason: reason why some writeback work initiated
1033d62241c7STejun Heo * @done: target wb_completion
1034d62241c7STejun Heo *
1035d62241c7STejun Heo * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
1036d62241c7STejun Heo * with the specified parameters.
1037d62241c7STejun Heo */
cgroup_writeback_by_id(u64 bdi_id,int memcg_id,enum wb_reason reason,struct wb_completion * done)10387490a2d2SShakeel Butt int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
1039d62241c7STejun Heo enum wb_reason reason, struct wb_completion *done)
1040d62241c7STejun Heo {
1041d62241c7STejun Heo struct backing_dev_info *bdi;
1042d62241c7STejun Heo struct cgroup_subsys_state *memcg_css;
1043d62241c7STejun Heo struct bdi_writeback *wb;
1044d62241c7STejun Heo struct wb_writeback_work *work;
10457490a2d2SShakeel Butt unsigned long dirty;
1046d62241c7STejun Heo int ret;
1047d62241c7STejun Heo
1048d62241c7STejun Heo /* lookup bdi and memcg */
1049d62241c7STejun Heo bdi = bdi_get_by_id(bdi_id);
1050d62241c7STejun Heo if (!bdi)
1051d62241c7STejun Heo return -ENOENT;
1052d62241c7STejun Heo
1053d62241c7STejun Heo rcu_read_lock();
1054d62241c7STejun Heo memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
1055d62241c7STejun Heo if (memcg_css && !css_tryget(memcg_css))
1056d62241c7STejun Heo memcg_css = NULL;
1057d62241c7STejun Heo rcu_read_unlock();
1058d62241c7STejun Heo if (!memcg_css) {
1059d62241c7STejun Heo ret = -ENOENT;
1060d62241c7STejun Heo goto out_bdi_put;
1061d62241c7STejun Heo }
1062d62241c7STejun Heo
1063d62241c7STejun Heo /*
1064d62241c7STejun Heo * And find the associated wb. If the wb isn't there already,
1065d62241c7STejun Heo * there's nothing to flush; don't create one.
1066d62241c7STejun Heo */
1067d62241c7STejun Heo wb = wb_get_lookup(bdi, memcg_css);
1068d62241c7STejun Heo if (!wb) {
1069d62241c7STejun Heo ret = -ENOENT;
1070d62241c7STejun Heo goto out_css_put;
1071d62241c7STejun Heo }
1072d62241c7STejun Heo
1073d62241c7STejun Heo /*
10747490a2d2SShakeel Butt * The caller is attempting to write out most of
1075d62241c7STejun Heo * the currently dirty pages. Let's take the current dirty page
1076d62241c7STejun Heo * count and inflate it by 25% which should be large enough to
1077d62241c7STejun Heo * flush out most dirty pages while avoiding getting livelocked by
1078d62241c7STejun Heo * concurrent dirtiers.
10797490a2d2SShakeel Butt *
10807490a2d2SShakeel Butt * BTW the memcg stats are flushed periodically and this is best-effort
10817490a2d2SShakeel Butt * estimation, so some potential error is ok.
1082d62241c7STejun Heo */
10837490a2d2SShakeel Butt dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY);
10847490a2d2SShakeel Butt dirty = dirty * 10 / 8;
1085d62241c7STejun Heo
1086d62241c7STejun Heo /* issue the writeback work */
1087d62241c7STejun Heo work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
1088d62241c7STejun Heo if (work) {
10897490a2d2SShakeel Butt work->nr_pages = dirty;
1090d62241c7STejun Heo work->sync_mode = WB_SYNC_NONE;
1091d62241c7STejun Heo work->range_cyclic = 1;
1092d62241c7STejun Heo work->reason = reason;
1093d62241c7STejun Heo work->done = done;
1094d62241c7STejun Heo work->auto_free = 1;
1095d62241c7STejun Heo wb_queue_work(wb, work);
1096d62241c7STejun Heo ret = 0;
1097d62241c7STejun Heo } else {
1098d62241c7STejun Heo ret = -ENOMEM;
1099d62241c7STejun Heo }
1100d62241c7STejun Heo
1101d62241c7STejun Heo wb_put(wb);
1102d62241c7STejun Heo out_css_put:
1103d62241c7STejun Heo css_put(memcg_css);
1104d62241c7STejun Heo out_bdi_put:
1105d62241c7STejun Heo bdi_put(bdi);
1106d62241c7STejun Heo return ret;
1107d62241c7STejun Heo }
1108d62241c7STejun Heo
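/*
 * Illustrative sketch only (not part of the original file): one way a
 * caller could use cgroup_writeback_by_id() to kick writeback for a
 * specific bdi/memcg pair and wait for it to finish.  The helper name is
 * made up for the example; DEFINE_WB_COMPLETION(), wb_wait_for_completion()
 * and WB_REASON_FOREIGN_FLUSH are the existing interfaces the foreign-flush
 * style of caller is expected to use.
 */
static void __maybe_unused example_flush_memcg_on_bdi(struct backing_dev_info *bdi,
						       u64 bdi_id, int memcg_id)
{
	DEFINE_WB_COMPLETION(done, bdi);

	/* queue WB_SYNC_NONE writeback for the matching bdi_writeback ... */
	if (!cgroup_writeback_by_id(bdi_id, memcg_id,
				    WB_REASON_FOREIGN_FLUSH, &done))
		/* ... and wait until the queued work has been executed */
		wb_wait_for_completion(&done);
}
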
1109d62241c7STejun Heo /**
1110a1a0e23eSTejun Heo * cgroup_writeback_umount - flush inode wb switches for umount
1111a1a0e23eSTejun Heo *
1112a1a0e23eSTejun Heo * This function is called when a super_block is about to be destroyed and
1113a1a0e23eSTejun Heo * flushes in-flight inode wb switches. An inode wb switch goes through
1114a1a0e23eSTejun Heo * RCU and then workqueue, so the two need to be flushed in order to ensure
1115a1a0e23eSTejun Heo * that all previously scheduled switches are finished. As wb switches are
1116a1a0e23eSTejun Heo * rare occurrences and synchronize_rcu() can take a while, perform
1117a1a0e23eSTejun Heo * flushing iff wb switches are in flight.
1118a1a0e23eSTejun Heo */
1119a1a0e23eSTejun Heo void cgroup_writeback_umount(void)
1120a1a0e23eSTejun Heo {
1121592fa002SRoman Gushchin /*
1122592fa002SRoman Gushchin * SB_ACTIVE should be reliably cleared before checking
1123592fa002SRoman Gushchin * isw_nr_in_flight, see generic_shutdown_super().
1124592fa002SRoman Gushchin */
1125592fa002SRoman Gushchin smp_mb();
1126592fa002SRoman Gushchin
1127a1a0e23eSTejun Heo if (atomic_read(&isw_nr_in_flight)) {
1128ec084de9SJiufei Xue /*
1129ec084de9SJiufei Xue * Use rcu_barrier() to wait for all pending callbacks to
1130ec084de9SJiufei Xue * ensure that all in-flight wb switches are in the workqueue.
1131ec084de9SJiufei Xue */
1132ec084de9SJiufei Xue rcu_barrier();
1133a1a0e23eSTejun Heo flush_workqueue(isw_wq);
1134a1a0e23eSTejun Heo }
1135a1a0e23eSTejun Heo }
1136a1a0e23eSTejun Heo
1137a1a0e23eSTejun Heo static int __init cgroup_writeback_init(void)
1138a1a0e23eSTejun Heo {
1139a1a0e23eSTejun Heo isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
1140a1a0e23eSTejun Heo if (!isw_wq)
1141a1a0e23eSTejun Heo return -ENOMEM;
1142a1a0e23eSTejun Heo return 0;
1143a1a0e23eSTejun Heo }
1144a1a0e23eSTejun Heo fs_initcall(cgroup_writeback_init);
1145a1a0e23eSTejun Heo
1146f2b65121STejun Heo #else /* CONFIG_CGROUP_WRITEBACK */
1147f2b65121STejun Heo
11487fc5854fSTejun Heo static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
11497fc5854fSTejun Heo static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
11507fc5854fSTejun Heo
1151f3b6a6dfSRoman Gushchin static void inode_cgwb_move_to_attached(struct inode *inode,
1152f3b6a6dfSRoman Gushchin struct bdi_writeback *wb)
1153f3b6a6dfSRoman Gushchin {
1154f3b6a6dfSRoman Gushchin assert_spin_locked(&wb->list_lock);
1155f3b6a6dfSRoman Gushchin assert_spin_locked(&inode->i_lock);
1156a9438b44SJan Kara WARN_ON_ONCE(inode->i_state & I_FREEING);
1157f3b6a6dfSRoman Gushchin
1158f3b6a6dfSRoman Gushchin inode->i_state &= ~I_SYNC_QUEUED;
1159f3b6a6dfSRoman Gushchin list_del_init(&inode->i_io_list);
1160f3b6a6dfSRoman Gushchin wb_io_lists_depopulated(wb);
1161f3b6a6dfSRoman Gushchin }
1162f3b6a6dfSRoman Gushchin
116387e1d789STejun Heo static struct bdi_writeback *
116487e1d789STejun Heo locked_inode_to_wb_and_lock_list(struct inode *inode)
116587e1d789STejun Heo __releases(&inode->i_lock)
116687e1d789STejun Heo __acquires(&wb->list_lock)
116787e1d789STejun Heo {
116887e1d789STejun Heo struct bdi_writeback *wb = inode_to_wb(inode);
116987e1d789STejun Heo
117087e1d789STejun Heo spin_unlock(&inode->i_lock);
117187e1d789STejun Heo spin_lock(&wb->list_lock);
117287e1d789STejun Heo return wb;
117387e1d789STejun Heo }
117487e1d789STejun Heo
117587e1d789STejun Heo static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
117687e1d789STejun Heo __acquires(&wb->list_lock)
117787e1d789STejun Heo {
117887e1d789STejun Heo struct bdi_writeback *wb = inode_to_wb(inode);
117987e1d789STejun Heo
118087e1d789STejun Heo spin_lock(&wb->list_lock);
118187e1d789STejun Heo return wb;
118287e1d789STejun Heo }
118387e1d789STejun Heo
1184f2b65121STejun Heo static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
1185f2b65121STejun Heo {
1186f2b65121STejun Heo return nr_pages;
1187f2b65121STejun Heo }
1188f2b65121STejun Heo
1189db125360STejun Heo static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
1190db125360STejun Heo struct wb_writeback_work *base_work,
1191db125360STejun Heo bool skip_if_busy)
1192db125360STejun Heo {
1193db125360STejun Heo might_sleep();
1194db125360STejun Heo
1195006a0973STejun Heo if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
1196db125360STejun Heo base_work->auto_free = 0;
1197db125360STejun Heo wb_queue_work(&bdi->wb, base_work);
1198db125360STejun Heo }
1199db125360STejun Heo }
1200db125360STejun Heo
1201703c2708STejun Heo #endif /* CONFIG_CGROUP_WRITEBACK */
1202703c2708STejun Heo
1203e8e8a0c6SJens Axboe /*
1204e8e8a0c6SJens Axboe * Add in the number of potentially dirty inodes, because each inode
1205e8e8a0c6SJens Axboe * write can dirty pagecache in the underlying blockdev.
1206e8e8a0c6SJens Axboe */
1207e8e8a0c6SJens Axboe static unsigned long get_nr_dirty_pages(void)
1208e8e8a0c6SJens Axboe {
1209e8e8a0c6SJens Axboe return global_node_page_state(NR_FILE_DIRTY) +
1210e8e8a0c6SJens Axboe get_nr_dirty_inodes();
1211e8e8a0c6SJens Axboe }
1212e8e8a0c6SJens Axboe
1213e8e8a0c6SJens Axboe static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
1214b6e51316SJens Axboe {
1215c00ddad3STejun Heo if (!wb_has_dirty_io(wb))
1216c00ddad3STejun Heo return;
1217c00ddad3STejun Heo
1218c00ddad3STejun Heo /*
1219aac8d41cSJens Axboe * All callers of this function want to start writeback of all
1220aac8d41cSJens Axboe * dirty pages. Places like vmscan can call this at a very
1221aac8d41cSJens Axboe * high frequency, causing pointless allocations of tons of
1222aac8d41cSJens Axboe * work items and keeping the flusher threads busy retrieving
1223aac8d41cSJens Axboe * that work. Ensure that we only allow one of them pending and
122485009b4fSJens Axboe * in flight at a time.
1225aac8d41cSJens Axboe */
122685009b4fSJens Axboe if (test_bit(WB_start_all, &wb->state) ||
122785009b4fSJens Axboe test_and_set_bit(WB_start_all, &wb->state))
1228aac8d41cSJens Axboe return;
1229aac8d41cSJens Axboe
123085009b4fSJens Axboe wb->start_all_reason = reason;
1231c00ddad3STejun Heo wb_wakeup(wb);
1232d3ddec76SWu Fengguang }
1233d3ddec76SWu Fengguang
1234c5444198SChristoph Hellwig /**
12359ecf4866STejun Heo * wb_start_background_writeback - start background writeback
12369ecf4866STejun Heo * @wb: bdi_writback to write from
1237c5444198SChristoph Hellwig *
1238c5444198SChristoph Hellwig * Description:
12396585027aSJan Kara * This makes sure WB_SYNC_NONE background writeback happens. When
12409ecf4866STejun Heo * this function returns, it is only guaranteed that for the given wb
12416585027aSJan Kara * some IO is happening if we are over the background dirty threshold.
12426585027aSJan Kara * Caller need not hold sb s_umount semaphore.
1243c5444198SChristoph Hellwig */
12449ecf4866STejun Heo void wb_start_background_writeback(struct bdi_writeback *wb)
1245c5444198SChristoph Hellwig {
12466585027aSJan Kara /*
12476585027aSJan Kara * We just wake up the flusher thread. It will perform background
12486585027aSJan Kara * writeback as soon as there is no other work to do.
12496585027aSJan Kara */
12505634cc2aSTejun Heo trace_writeback_wake_background(wb);
12519ecf4866STejun Heo wb_wakeup(wb);
12521da177e4SLinus Torvalds }
12531da177e4SLinus Torvalds
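/*
 * Minimal usage sketch (illustrative, not part of the original file):
 * callers such as the dirty throttling path typically pair the background
 * threshold check with this wakeup.  The helper name is an assumption for
 * the example; wb_over_bg_thresh() and wb_start_background_writeback() are
 * the real interfaces used here.
 */
static void __maybe_unused example_kick_background_writeback(struct bdi_writeback *wb)
{
	/* wake the flusher only once we are over the background threshold */
	if (wb_over_bg_thresh(wb))
		wb_start_background_writeback(wb);
}
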
12541da177e4SLinus Torvalds /*
1255a66979abSDave Chinner * Remove the inode from the writeback list it is on.
1256a66979abSDave Chinner */
1257c7f54084SDave Chinner void inode_io_list_del(struct inode *inode)
1258a66979abSDave Chinner {
125987e1d789STejun Heo struct bdi_writeback *wb;
1260a66979abSDave Chinner
126187e1d789STejun Heo wb = inode_to_wb_and_lock_list(inode);
1262b35250c0SJan Kara spin_lock(&inode->i_lock);
1263f3b6a6dfSRoman Gushchin
1264f3b6a6dfSRoman Gushchin inode->i_state &= ~I_SYNC_QUEUED;
1265f3b6a6dfSRoman Gushchin list_del_init(&inode->i_io_list);
1266f3b6a6dfSRoman Gushchin wb_io_lists_depopulated(wb);
1267f3b6a6dfSRoman Gushchin
1268b35250c0SJan Kara spin_unlock(&inode->i_lock);
126952ebea74STejun Heo spin_unlock(&wb->list_lock);
1270f758eeabSChristoph Hellwig }
12714301efa4SJan Kara EXPORT_SYMBOL(inode_io_list_del);
1272a66979abSDave Chinner
1273a66979abSDave Chinner /*
12746c60d2b5SDave Chinner * mark an inode as under writeback on the sb
12756c60d2b5SDave Chinner */
12766c60d2b5SDave Chinner void sb_mark_inode_writeback(struct inode *inode)
12776c60d2b5SDave Chinner {
12786c60d2b5SDave Chinner struct super_block *sb = inode->i_sb;
12796c60d2b5SDave Chinner unsigned long flags;
12806c60d2b5SDave Chinner
12816c60d2b5SDave Chinner if (list_empty(&inode->i_wb_list)) {
12826c60d2b5SDave Chinner spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
12839a46b04fSBrian Foster if (list_empty(&inode->i_wb_list)) {
12846c60d2b5SDave Chinner list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
12859a46b04fSBrian Foster trace_sb_mark_inode_writeback(inode);
12869a46b04fSBrian Foster }
12876c60d2b5SDave Chinner spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
12886c60d2b5SDave Chinner }
12896c60d2b5SDave Chinner }
12906c60d2b5SDave Chinner
12916c60d2b5SDave Chinner /*
12926c60d2b5SDave Chinner * clear an inode as under writeback on the sb
12936c60d2b5SDave Chinner */
12946c60d2b5SDave Chinner void sb_clear_inode_writeback(struct inode *inode)
12956c60d2b5SDave Chinner {
12966c60d2b5SDave Chinner struct super_block *sb = inode->i_sb;
12976c60d2b5SDave Chinner unsigned long flags;
12986c60d2b5SDave Chinner
12996c60d2b5SDave Chinner if (!list_empty(&inode->i_wb_list)) {
13006c60d2b5SDave Chinner spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
13019a46b04fSBrian Foster if (!list_empty(&inode->i_wb_list)) {
13026c60d2b5SDave Chinner list_del_init(&inode->i_wb_list);
13039a46b04fSBrian Foster trace_sb_clear_inode_writeback(inode);
13049a46b04fSBrian Foster }
13056c60d2b5SDave Chinner spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
13066c60d2b5SDave Chinner }
13076c60d2b5SDave Chinner }
13086c60d2b5SDave Chinner
13096c60d2b5SDave Chinner /*
13106610a0bcSAndrew Morton * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
13116610a0bcSAndrew Morton * furthest end of its superblock's dirty-inode list.
13126610a0bcSAndrew Morton *
13136610a0bcSAndrew Morton * Before stamping the inode's ->dirtied_when, we check to see whether it is
131466f3b8e2SJens Axboe * already the most-recently-dirtied inode on the b_dirty list. If that is
13156610a0bcSAndrew Morton * the case then the inode must have been redirtied while it was being written
13166610a0bcSAndrew Morton * out and we don't reset its dirtied_when.
13176610a0bcSAndrew Morton */
1318b35250c0SJan Kara static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
13196610a0bcSAndrew Morton {
1320b35250c0SJan Kara assert_spin_locked(&inode->i_lock);
1321b35250c0SJan Kara
1322a9438b44SJan Kara inode->i_state &= ~I_SYNC_QUEUED;
1323a9438b44SJan Kara /*
1324a9438b44SJan Kara * When the inode is being freed just don't bother with dirty list
1325a9438b44SJan Kara * tracking. Flush worker will ignore this inode anyway and it will
1326a9438b44SJan Kara * trigger assertions in inode_io_list_move_locked().
1327a9438b44SJan Kara */
1328a9438b44SJan Kara if (inode->i_state & I_FREEING) {
1329a9438b44SJan Kara list_del_init(&inode->i_io_list);
1330a9438b44SJan Kara wb_io_lists_depopulated(wb);
1331a9438b44SJan Kara return;
1332a9438b44SJan Kara }
133303ba3782SJens Axboe if (!list_empty(&wb->b_dirty)) {
133466f3b8e2SJens Axboe struct inode *tail;
13356610a0bcSAndrew Morton
13367ccf19a8SNick Piggin tail = wb_inode(wb->b_dirty.next);
133766f3b8e2SJens Axboe if (time_before(inode->dirtied_when, tail->dirtied_when))
13386610a0bcSAndrew Morton inode->dirtied_when = jiffies;
13396610a0bcSAndrew Morton }
1340c7f54084SDave Chinner inode_io_list_move_locked(inode, wb, &wb->b_dirty);
13416610a0bcSAndrew Morton }
13426610a0bcSAndrew Morton
1343b35250c0SJan Kara static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
1344b35250c0SJan Kara {
1345b35250c0SJan Kara spin_lock(&inode->i_lock);
1346b35250c0SJan Kara redirty_tail_locked(inode, wb);
1347b35250c0SJan Kara spin_unlock(&inode->i_lock);
1348b35250c0SJan Kara }
1349b35250c0SJan Kara
13506610a0bcSAndrew Morton /*
135166f3b8e2SJens Axboe * requeue inode for re-scanning after bdi->b_io list is exhausted.
1352c986d1e2SAndrew Morton */
1353f758eeabSChristoph Hellwig static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
1354c986d1e2SAndrew Morton {
1355c7f54084SDave Chinner inode_io_list_move_locked(inode, wb, &wb->b_more_io);
1356c986d1e2SAndrew Morton }
1357c986d1e2SAndrew Morton
13581c0eeaf5SJoern Engel static void inode_sync_complete(struct inode *inode)
13591c0eeaf5SJoern Engel {
1360365b94aeSJan Kara inode->i_state &= ~I_SYNC;
13614eff96ddSJan Kara /* If inode is clean and unused, put it into LRU now... */
13624eff96ddSJan Kara inode_add_lru(inode);
1363365b94aeSJan Kara /* Waiters must see I_SYNC cleared before being woken up */
13641c0eeaf5SJoern Engel smp_mb();
13651c0eeaf5SJoern Engel wake_up_bit(&inode->i_state, __I_SYNC);
13661c0eeaf5SJoern Engel }
13671c0eeaf5SJoern Engel
1368d2caa3c5SJeff Layton static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1369d2caa3c5SJeff Layton {
1370d2caa3c5SJeff Layton bool ret = time_after(inode->dirtied_when, t);
1371d2caa3c5SJeff Layton #ifndef CONFIG_64BIT
1372d2caa3c5SJeff Layton /*
1373d2caa3c5SJeff Layton * For inodes being constantly redirtied, dirtied_when can get stuck.
1374d2caa3c5SJeff Layton * It _appears_ to be in the future, but is actually in distant past.
1375d2caa3c5SJeff Layton * This test is necessary to prevent such wrapped-around relative times
13765b0830cbSJens Axboe * from permanently stopping the whole bdi writeback.
1377d2caa3c5SJeff Layton */
1378d2caa3c5SJeff Layton ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1379d2caa3c5SJeff Layton #endif
1380d2caa3c5SJeff Layton return ret;
1381d2caa3c5SJeff Layton }
1382d2caa3c5SJeff Layton
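/*
 * Worked example for the 32-bit wraparound check above (hypothetical
 * numbers): suppose dirtied_when got stuck at 0x7fff0000 and jiffies has
 * since wrapped around to 0x00000100, which is also the cutoff @t.  Then
 * time_after(0x7fff0000, 0x00000100) is true because the unsigned
 * difference interpreted as signed is negative, so dirtied_when appears to
 * be in the future.  time_before_eq(0x7fff0000, jiffies) is false for these
 * values, so inode_dirtied_after() correctly returns false and the inode is
 * still treated as expired.
 */
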
1383c986d1e2SAndrew Morton /*
1384f9cae926SJan Kara * Move expired (dirtied before dirtied_before) dirty inodes from
1385697e6fedSJan Kara * @delaying_queue to @dispatch_queue.
13862c136579SFengguang Wu */
1387e84d0a4fSWu Fengguang static int move_expired_inodes(struct list_head *delaying_queue,
13882c136579SFengguang Wu struct list_head *dispatch_queue,
13895fcd5750SJan Kara unsigned long dirtied_before)
13902c136579SFengguang Wu {
13915c03449dSShaohua Li LIST_HEAD(tmp);
13925c03449dSShaohua Li struct list_head *pos, *node;
1393cf137307SJens Axboe struct super_block *sb = NULL;
13945c03449dSShaohua Li struct inode *inode;
1395cf137307SJens Axboe int do_sb_sort = 0;
1396e84d0a4fSWu Fengguang int moved = 0;
13975c03449dSShaohua Li
13982c136579SFengguang Wu while (!list_empty(delaying_queue)) {
13997ccf19a8SNick Piggin inode = wb_inode(delaying_queue->prev);
1400f9cae926SJan Kara if (inode_dirtied_after(inode, dirtied_before))
14012c136579SFengguang Wu break;
140210e14073SJchao Sun spin_lock(&inode->i_lock);
1403c7f54084SDave Chinner list_move(&inode->i_io_list, &tmp);
1404a8855990SJan Kara moved++;
14055afced3bSJan Kara inode->i_state |= I_SYNC_QUEUED;
14065afced3bSJan Kara spin_unlock(&inode->i_lock);
1407a8855990SJan Kara if (sb_is_blkdev_sb(inode->i_sb))
1408a8855990SJan Kara continue;
1409cf137307SJens Axboe if (sb && sb != inode->i_sb)
1410cf137307SJens Axboe do_sb_sort = 1;
1411cf137307SJens Axboe sb = inode->i_sb;
14125c03449dSShaohua Li }
14135c03449dSShaohua Li
1414cf137307SJens Axboe /* just one sb in list, splice to dispatch_queue and we're done */
1415cf137307SJens Axboe if (!do_sb_sort) {
1416cf137307SJens Axboe list_splice(&tmp, dispatch_queue);
1417e84d0a4fSWu Fengguang goto out;
1418cf137307SJens Axboe }
1419cf137307SJens Axboe
142010e14073SJchao Sun /*
142110e14073SJchao Sun * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue',
142210e14073SJchao Sun * we don't take inode->i_lock here because it would just be pointless overhead.
142310e14073SJchao Sun * Inode is already marked as I_SYNC_QUEUED so writeback list handling is
142410e14073SJchao Sun * fully under our control.
142510e14073SJchao Sun */
14265c03449dSShaohua Li while (!list_empty(&tmp)) {
14277ccf19a8SNick Piggin sb = wb_inode(tmp.prev)->i_sb;
14285c03449dSShaohua Li list_for_each_prev_safe(pos, node, &tmp) {
14297ccf19a8SNick Piggin inode = wb_inode(pos);
14305c03449dSShaohua Li if (inode->i_sb == sb)
1431c7f54084SDave Chinner list_move(&inode->i_io_list, dispatch_queue);
14322c136579SFengguang Wu }
14332c136579SFengguang Wu }
1434e84d0a4fSWu Fengguang out:
1435e84d0a4fSWu Fengguang return moved;
14365c03449dSShaohua Li }
14372c136579SFengguang Wu
14382c136579SFengguang Wu /*
14392c136579SFengguang Wu * Queue all expired dirty inodes for io, eldest first.
14404ea879b9SWu Fengguang * Before
14414ea879b9SWu Fengguang * newly dirtied b_dirty b_io b_more_io
14424ea879b9SWu Fengguang * =============> gf edc BA
14434ea879b9SWu Fengguang * After
14444ea879b9SWu Fengguang * newly dirtied b_dirty b_io b_more_io
14454ea879b9SWu Fengguang * =============> g fBAedc
14464ea879b9SWu Fengguang * |
14474ea879b9SWu Fengguang * +--> dequeue for IO
14482c136579SFengguang Wu */
1449f9cae926SJan Kara static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
1450f9cae926SJan Kara unsigned long dirtied_before)
14512c136579SFengguang Wu {
1452e84d0a4fSWu Fengguang int moved;
1453f9cae926SJan Kara unsigned long time_expire_jif = dirtied_before;
14540ae45f63STheodore Ts'o
1455f758eeabSChristoph Hellwig assert_spin_locked(&wb->list_lock);
14564ea879b9SWu Fengguang list_splice_init(&wb->b_more_io, &wb->b_io);
14575fcd5750SJan Kara moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
1458f9cae926SJan Kara if (!work->for_sync)
1459f9cae926SJan Kara time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
14600ae45f63STheodore Ts'o moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
14615fcd5750SJan Kara time_expire_jif);
1462d6c10f1fSTejun Heo if (moved)
1463d6c10f1fSTejun Heo wb_io_lists_populated(wb);
1464f9cae926SJan Kara trace_writeback_queue_io(wb, work, dirtied_before, moved);
146566f3b8e2SJens Axboe }
146666f3b8e2SJens Axboe
1467a9185b41SChristoph Hellwig static int write_inode(struct inode *inode, struct writeback_control *wbc)
146866f3b8e2SJens Axboe {
14699fb0a7daSTejun Heo int ret;
14709fb0a7daSTejun Heo
14719fb0a7daSTejun Heo if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
14729fb0a7daSTejun Heo trace_writeback_write_inode_start(inode, wbc);
14739fb0a7daSTejun Heo ret = inode->i_sb->s_op->write_inode(inode, wbc);
14749fb0a7daSTejun Heo trace_writeback_write_inode(inode, wbc);
14759fb0a7daSTejun Heo return ret;
14769fb0a7daSTejun Heo }
147703ba3782SJens Axboe return 0;
147866f3b8e2SJens Axboe }
147908d8e974SFengguang Wu
14802c136579SFengguang Wu /*
1481169ebd90SJan Kara * Wait for writeback on an inode to complete. Called with i_lock held.
1482169ebd90SJan Kara * Caller must make sure inode cannot go away when we drop i_lock.
148301c03194SChristoph Hellwig */
1484169ebd90SJan Kara static void __inode_wait_for_writeback(struct inode *inode)
1485169ebd90SJan Kara __releases(inode->i_lock)
1486169ebd90SJan Kara __acquires(inode->i_lock)
148701c03194SChristoph Hellwig {
148801c03194SChristoph Hellwig DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
148901c03194SChristoph Hellwig wait_queue_head_t *wqh;
149001c03194SChristoph Hellwig
149101c03194SChristoph Hellwig wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
149258a9d3d8SRichard Kennedy while (inode->i_state & I_SYNC) {
1493250df6edSDave Chinner spin_unlock(&inode->i_lock);
149474316201SNeilBrown __wait_on_bit(wqh, &wq, bit_wait,
149574316201SNeilBrown TASK_UNINTERRUPTIBLE);
1496250df6edSDave Chinner spin_lock(&inode->i_lock);
149758a9d3d8SRichard Kennedy }
149801c03194SChristoph Hellwig }
149901c03194SChristoph Hellwig
150001c03194SChristoph Hellwig /*
1501169ebd90SJan Kara * Wait for writeback on an inode to complete. Caller must have inode pinned.
1502169ebd90SJan Kara */
1503169ebd90SJan Kara void inode_wait_for_writeback(struct inode *inode)
1504169ebd90SJan Kara {
1505169ebd90SJan Kara spin_lock(&inode->i_lock);
1506169ebd90SJan Kara __inode_wait_for_writeback(inode);
1507169ebd90SJan Kara spin_unlock(&inode->i_lock);
1508169ebd90SJan Kara }
1509169ebd90SJan Kara
1510169ebd90SJan Kara /*
1511169ebd90SJan Kara * Sleep until I_SYNC is cleared. This function must be called with i_lock
1512169ebd90SJan Kara * held and drops it. It is aimed for callers not holding any inode reference
1513169ebd90SJan Kara * so once i_lock is dropped, inode can go away.
1514169ebd90SJan Kara */
1515169ebd90SJan Kara static void inode_sleep_on_writeback(struct inode *inode)
1516169ebd90SJan Kara __releases(inode->i_lock)
1517169ebd90SJan Kara {
1518169ebd90SJan Kara DEFINE_WAIT(wait);
1519169ebd90SJan Kara wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1520169ebd90SJan Kara int sleep;
1521169ebd90SJan Kara
1522169ebd90SJan Kara prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1523169ebd90SJan Kara sleep = inode->i_state & I_SYNC;
1524169ebd90SJan Kara spin_unlock(&inode->i_lock);
1525169ebd90SJan Kara if (sleep)
1526169ebd90SJan Kara schedule();
1527169ebd90SJan Kara finish_wait(wqh, &wait);
1528169ebd90SJan Kara }
1529169ebd90SJan Kara
1530169ebd90SJan Kara /*
1531ccb26b5aSJan Kara * Find proper writeback list for the inode depending on its current state and
1532ccb26b5aSJan Kara * possibly also change of its state while we were doing writeback. Here we
1533ccb26b5aSJan Kara * handle things such as livelock prevention or fairness of writeback among
1534ccb26b5aSJan Kara * inodes. This function can be called only by the flusher thread - no one else
1535ccb26b5aSJan Kara * processes all inodes in writeback lists and requeueing inodes behind the flusher
1536ccb26b5aSJan Kara * thread's back can have unexpected consequences.
1537ccb26b5aSJan Kara */
1538ccb26b5aSJan Kara static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1539ccb26b5aSJan Kara struct writeback_control *wbc)
1540ccb26b5aSJan Kara {
1541ccb26b5aSJan Kara if (inode->i_state & I_FREEING)
1542ccb26b5aSJan Kara return;
1543ccb26b5aSJan Kara
1544ccb26b5aSJan Kara /*
1545ccb26b5aSJan Kara * Sync livelock prevention. Each inode is tagged and synced in one
1546ccb26b5aSJan Kara * shot. If still dirty, it will be redirty_tail()'ed below. Update
1547ccb26b5aSJan Kara * the dirty time to prevent it from being enqueued and synced again.
1548ccb26b5aSJan Kara */
1549ccb26b5aSJan Kara if ((inode->i_state & I_DIRTY) &&
1550ccb26b5aSJan Kara (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1551ccb26b5aSJan Kara inode->dirtied_when = jiffies;
1552ccb26b5aSJan Kara
15534f8ad655SJan Kara if (wbc->pages_skipped) {
15544f8ad655SJan Kara /*
1555be049c3aSChunhai Guo * Writeback is not making progress due to locked buffers.
1556be049c3aSChunhai Guo * Skip this inode for now. Although having skipped pages
1557be049c3aSChunhai Guo * is odd for clean inodes, it can happen for some
1558be049c3aSChunhai Guo * filesystems so handle that gracefully.
15594f8ad655SJan Kara */
1560be049c3aSChunhai Guo if (inode->i_state & I_DIRTY_ALL)
1561b35250c0SJan Kara redirty_tail_locked(inode, wb);
1562be049c3aSChunhai Guo else
1563be049c3aSChunhai Guo inode_cgwb_move_to_attached(inode, wb);
15644f8ad655SJan Kara return;
15654f8ad655SJan Kara }
15664f8ad655SJan Kara
1567ccb26b5aSJan Kara if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1568ccb26b5aSJan Kara /*
1569ccb26b5aSJan Kara * We didn't write back all the pages. nfs_writepages()
1570ccb26b5aSJan Kara * sometimes bails out without doing anything.
1571ccb26b5aSJan Kara */
1572ccb26b5aSJan Kara if (wbc->nr_to_write <= 0) {
1573ccb26b5aSJan Kara /* Slice used up. Queue for next turn. */
1574ccb26b5aSJan Kara requeue_io(inode, wb);
1575ccb26b5aSJan Kara } else {
1576ccb26b5aSJan Kara /*
1577ccb26b5aSJan Kara * Writeback blocked by something other than
1578ccb26b5aSJan Kara * congestion. Delay the inode for some time to
1579ccb26b5aSJan Kara * avoid spinning on the CPU (100% iowait)
1580ccb26b5aSJan Kara * retrying writeback of the dirty page/inode
1581ccb26b5aSJan Kara * that cannot be performed immediately.
1582ccb26b5aSJan Kara */
1583b35250c0SJan Kara redirty_tail_locked(inode, wb);
1584ccb26b5aSJan Kara }
1585ccb26b5aSJan Kara } else if (inode->i_state & I_DIRTY) {
1586ccb26b5aSJan Kara /*
1587ccb26b5aSJan Kara * Filesystems can dirty the inode during writeback operations,
1588ccb26b5aSJan Kara * such as delayed allocation during submission or metadata
1589ccb26b5aSJan Kara * updates after data IO completion.
1590ccb26b5aSJan Kara */
1591b35250c0SJan Kara redirty_tail_locked(inode, wb);
15920ae45f63STheodore Ts'o } else if (inode->i_state & I_DIRTY_TIME) {
1593a2f48706STheodore Ts'o inode->dirtied_when = jiffies;
1594c7f54084SDave Chinner inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
15955afced3bSJan Kara inode->i_state &= ~I_SYNC_QUEUED;
1596ccb26b5aSJan Kara } else {
1597ccb26b5aSJan Kara /* The inode is clean. Remove from writeback lists. */
1598f3b6a6dfSRoman Gushchin inode_cgwb_move_to_attached(inode, wb);
1599ccb26b5aSJan Kara }
1600ccb26b5aSJan Kara }
1601ccb26b5aSJan Kara
1602ccb26b5aSJan Kara /*
1603da0c4c60SEric Biggers * Write out an inode and its dirty pages (or some of its dirty pages, depending
1604da0c4c60SEric Biggers * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state.
1605da0c4c60SEric Biggers *
1606da0c4c60SEric Biggers * This doesn't remove the inode from the writeback list it is on, except
1607da0c4c60SEric Biggers * potentially to move it from b_dirty_time to b_dirty due to timestamp
1608da0c4c60SEric Biggers * expiration. The caller is otherwise responsible for writeback list handling.
1609da0c4c60SEric Biggers *
1610da0c4c60SEric Biggers * The caller is also responsible for setting the I_SYNC flag beforehand and
1611da0c4c60SEric Biggers * calling inode_sync_complete() to clear it afterwards.
16121da177e4SLinus Torvalds */
16131da177e4SLinus Torvalds static int
1614cd8ed2a4SYan Hong __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
16151da177e4SLinus Torvalds {
16161da177e4SLinus Torvalds struct address_space *mapping = inode->i_mapping;
1617251d6a47SWu Fengguang long nr_to_write = wbc->nr_to_write;
161801c03194SChristoph Hellwig unsigned dirty;
16191da177e4SLinus Torvalds int ret;
16201da177e4SLinus Torvalds
16214f8ad655SJan Kara WARN_ON(!(inode->i_state & I_SYNC));
16221da177e4SLinus Torvalds
16239fb0a7daSTejun Heo trace_writeback_single_inode_start(inode, wbc, nr_to_write);
16249fb0a7daSTejun Heo
16251da177e4SLinus Torvalds ret = do_writepages(mapping, wbc);
16261da177e4SLinus Torvalds
162726821ed4SChristoph Hellwig /*
162826821ed4SChristoph Hellwig * Make sure to wait on the data before writing out the metadata.
162926821ed4SChristoph Hellwig * This is important for filesystems that modify metadata on data
16307747bd4bSDave Chinner * I/O completion. We don't do it for sync(2) writeback because it has a
16317747bd4bSDave Chinner * separate, external IO completion path and ->sync_fs for guaranteeing
16327747bd4bSDave Chinner * inode metadata is written back correctly.
163326821ed4SChristoph Hellwig */
16347747bd4bSDave Chinner if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
163526821ed4SChristoph Hellwig int err = filemap_fdatawait(mapping);
16361da177e4SLinus Torvalds if (ret == 0)
16371da177e4SLinus Torvalds ret = err;
16381da177e4SLinus Torvalds }
16391da177e4SLinus Torvalds
16405547e8aaSDmitry Monakhov /*
16411e249cb5SEric Biggers * If the inode has dirty timestamps and we need to write them, call
16421e249cb5SEric Biggers * mark_inode_dirty_sync() to notify the filesystem about it and to
16431e249cb5SEric Biggers * change I_DIRTY_TIME into I_DIRTY_SYNC.
16441e249cb5SEric Biggers */
16451e249cb5SEric Biggers if ((inode->i_state & I_DIRTY_TIME) &&
164683dc881dSEric Biggers (wbc->sync_mode == WB_SYNC_ALL ||
16471e249cb5SEric Biggers time_after(jiffies, inode->dirtied_time_when +
16481e249cb5SEric Biggers dirtytime_expire_interval * HZ))) {
16491e249cb5SEric Biggers trace_writeback_lazytime(inode);
16501e249cb5SEric Biggers mark_inode_dirty_sync(inode);
16511e249cb5SEric Biggers }
16521e249cb5SEric Biggers
16531e249cb5SEric Biggers /*
1654da0c4c60SEric Biggers * Get and clear the dirty flags from i_state. This needs to be done
1655da0c4c60SEric Biggers * after calling writepages because some filesystems may redirty the
1656da0c4c60SEric Biggers * inode during writepages due to delalloc. It also needs to be done
1657da0c4c60SEric Biggers * after handling timestamp expiration, as that may dirty the inode too.
16585547e8aaSDmitry Monakhov */
1659250df6edSDave Chinner spin_lock(&inode->i_lock);
16605547e8aaSDmitry Monakhov dirty = inode->i_state & I_DIRTY;
16610ae45f63STheodore Ts'o inode->i_state &= ~dirty;
16629c6ac78eSTejun Heo
16639c6ac78eSTejun Heo /*
16649c6ac78eSTejun Heo * Paired with smp_mb() in __mark_inode_dirty(). This allows
16659c6ac78eSTejun Heo * __mark_inode_dirty() to test i_state without grabbing i_lock -
16669c6ac78eSTejun Heo * either they see the I_DIRTY bits cleared or we see the dirtied
16679c6ac78eSTejun Heo * inode.
16689c6ac78eSTejun Heo *
16699c6ac78eSTejun Heo * I_DIRTY_PAGES is always cleared together above even if @mapping
16709c6ac78eSTejun Heo * still has dirty pages. The flag is reinstated after smp_mb() if
16719c6ac78eSTejun Heo * necessary. This guarantees that either __mark_inode_dirty()
16729c6ac78eSTejun Heo * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
16739c6ac78eSTejun Heo */
16749c6ac78eSTejun Heo smp_mb();
16759c6ac78eSTejun Heo
16769c6ac78eSTejun Heo if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
16779c6ac78eSTejun Heo inode->i_state |= I_DIRTY_PAGES;
167808276bdaSDavid Howells else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) {
167908276bdaSDavid Howells if (!(inode->i_state & I_DIRTY_PAGES)) {
168008276bdaSDavid Howells inode->i_state &= ~I_PINNING_FSCACHE_WB;
168108276bdaSDavid Howells wbc->unpinned_fscache_wb = true;
168208276bdaSDavid Howells dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */
168308276bdaSDavid Howells }
168408276bdaSDavid Howells }
16859c6ac78eSTejun Heo
1686250df6edSDave Chinner spin_unlock(&inode->i_lock);
16879c6ac78eSTejun Heo
168826821ed4SChristoph Hellwig /* Don't write the inode if only I_DIRTY_PAGES was set */
16890ae45f63STheodore Ts'o if (dirty & ~I_DIRTY_PAGES) {
1690a9185b41SChristoph Hellwig int err = write_inode(inode, wbc);
16911da177e4SLinus Torvalds if (ret == 0)
16921da177e4SLinus Torvalds ret = err;
16931da177e4SLinus Torvalds }
169408276bdaSDavid Howells wbc->unpinned_fscache_wb = false;
16954f8ad655SJan Kara trace_writeback_single_inode(inode, wbc, nr_to_write);
16964f8ad655SJan Kara return ret;
16974f8ad655SJan Kara }
16984f8ad655SJan Kara
16994f8ad655SJan Kara /*
1700da0c4c60SEric Biggers * Write out an inode's dirty data and metadata on-demand, i.e. separately from
1701da0c4c60SEric Biggers * the regular batched writeback done by the flusher threads in
1702da0c4c60SEric Biggers * writeback_sb_inodes(). @wbc controls various aspects of the write, such as
1703da0c4c60SEric Biggers * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE).
17044f8ad655SJan Kara *
1705da0c4c60SEric Biggers * To prevent the inode from going away, either the caller must have a reference
1706da0c4c60SEric Biggers * to the inode, or the inode must have I_WILL_FREE or I_FREEING set.
17074f8ad655SJan Kara */
1708aaf25593STejun Heo static int writeback_single_inode(struct inode *inode,
17094f8ad655SJan Kara struct writeback_control *wbc)
17104f8ad655SJan Kara {
1711aaf25593STejun Heo struct bdi_writeback *wb;
17124f8ad655SJan Kara int ret = 0;
17134f8ad655SJan Kara
17144f8ad655SJan Kara spin_lock(&inode->i_lock);
17154f8ad655SJan Kara if (!atomic_read(&inode->i_count))
17164f8ad655SJan Kara WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
17174f8ad655SJan Kara else
17184f8ad655SJan Kara WARN_ON(inode->i_state & I_WILL_FREE);
17194f8ad655SJan Kara
17204f8ad655SJan Kara if (inode->i_state & I_SYNC) {
1721da0c4c60SEric Biggers /*
1722da0c4c60SEric Biggers * Writeback is already running on the inode. For WB_SYNC_NONE,
1723da0c4c60SEric Biggers * that's enough and we can just return. For WB_SYNC_ALL, we
1724da0c4c60SEric Biggers * must wait for the existing writeback to complete, then do
1725da0c4c60SEric Biggers * writeback again if there's anything left.
1726da0c4c60SEric Biggers */
17274f8ad655SJan Kara if (wbc->sync_mode != WB_SYNC_ALL)
17284f8ad655SJan Kara goto out;
1729169ebd90SJan Kara __inode_wait_for_writeback(inode);
17304f8ad655SJan Kara }
17314f8ad655SJan Kara WARN_ON(inode->i_state & I_SYNC);
17324f8ad655SJan Kara /*
1733da0c4c60SEric Biggers * If the inode is already fully clean, then there's nothing to do.
1734da0c4c60SEric Biggers *
1735da0c4c60SEric Biggers * For data-integrity syncs we also need to check whether any pages are
1736da0c4c60SEric Biggers * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If
1737da0c4c60SEric Biggers * there are any such pages, we'll need to wait for them.
17384f8ad655SJan Kara */
17390ae45f63STheodore Ts'o if (!(inode->i_state & I_DIRTY_ALL) &&
1740f9b0e058SJan Kara (wbc->sync_mode != WB_SYNC_ALL ||
1741f9b0e058SJan Kara !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
17424f8ad655SJan Kara goto out;
17434f8ad655SJan Kara inode->i_state |= I_SYNC;
1744b16b1debSTejun Heo wbc_attach_and_unlock_inode(wbc, inode);
17454f8ad655SJan Kara
1746cd8ed2a4SYan Hong ret = __writeback_single_inode(inode, wbc);
17471da177e4SLinus Torvalds
1748b16b1debSTejun Heo wbc_detach_inode(wbc);
1749aaf25593STejun Heo
1750aaf25593STejun Heo wb = inode_to_wb_and_lock_list(inode);
1751250df6edSDave Chinner spin_lock(&inode->i_lock);
17524f8ad655SJan Kara /*
17534e3c51f4SSvyatoslav Feldsherov * If the inode is being freed, its i_io_list shouldn't be updated
17544e3c51f4SSvyatoslav Feldsherov * as it can be finally deleted at this moment.
17554e3c51f4SSvyatoslav Feldsherov */
17564e3c51f4SSvyatoslav Feldsherov if (!(inode->i_state & I_FREEING)) {
17574e3c51f4SSvyatoslav Feldsherov /*
17584e3c51f4SSvyatoslav Feldsherov * If the inode is now fully clean, then it can be safely
17594e3c51f4SSvyatoslav Feldsherov * removed from its writeback list (if any). Otherwise the
17604e3c51f4SSvyatoslav Feldsherov * flusher threads are responsible for the writeback lists.
17614f8ad655SJan Kara */
17620ae45f63STheodore Ts'o if (!(inode->i_state & I_DIRTY_ALL))
1763f3b6a6dfSRoman Gushchin inode_cgwb_move_to_attached(inode, wb);
1764cbfecb92SLukas Czerner else if (!(inode->i_state & I_SYNC_QUEUED)) {
1765cbfecb92SLukas Czerner if ((inode->i_state & I_DIRTY))
1766846a3351SJing Xia redirty_tail_locked(inode, wb);
1767cbfecb92SLukas Czerner else if (inode->i_state & I_DIRTY_TIME) {
1768cbfecb92SLukas Czerner inode->dirtied_when = jiffies;
17694e3c51f4SSvyatoslav Feldsherov inode_io_list_move_locked(inode,
17704e3c51f4SSvyatoslav Feldsherov wb,
17714e3c51f4SSvyatoslav Feldsherov &wb->b_dirty_time);
17724e3c51f4SSvyatoslav Feldsherov }
1773cbfecb92SLukas Czerner }
1774cbfecb92SLukas Czerner }
1775846a3351SJing Xia
17764f8ad655SJan Kara spin_unlock(&wb->list_lock);
17771c0eeaf5SJoern Engel inode_sync_complete(inode);
17784f8ad655SJan Kara out:
17794f8ad655SJan Kara spin_unlock(&inode->i_lock);
17801da177e4SLinus Torvalds return ret;
17811da177e4SLinus Torvalds }
17821da177e4SLinus Torvalds
1783a88a341aSTejun Heo static long writeback_chunk_size(struct bdi_writeback *wb,
17841a12d8bdSWu Fengguang struct wb_writeback_work *work)
1785d46db3d5SWu Fengguang {
1786d46db3d5SWu Fengguang long pages;
1787d46db3d5SWu Fengguang
1788d46db3d5SWu Fengguang /*
1789d46db3d5SWu Fengguang * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
1790d46db3d5SWu Fengguang * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
1791d46db3d5SWu Fengguang * here avoids calling into writeback_inodes_wb() more than once.
1792d46db3d5SWu Fengguang *
1793d46db3d5SWu Fengguang * The intended call sequence for WB_SYNC_ALL writeback is:
1794d46db3d5SWu Fengguang *
1795d46db3d5SWu Fengguang * wb_writeback()
1796d46db3d5SWu Fengguang * writeback_sb_inodes() <== called only once
1797d46db3d5SWu Fengguang * write_cache_pages() <== called once for each inode
1798d46db3d5SWu Fengguang * (quickly) tag currently dirty pages
1799d46db3d5SWu Fengguang * (maybe slowly) sync all tagged pages
1800d46db3d5SWu Fengguang */
1801d46db3d5SWu Fengguang if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1802d46db3d5SWu Fengguang pages = LONG_MAX;
18031a12d8bdSWu Fengguang else {
1804a88a341aSTejun Heo pages = min(wb->avg_write_bandwidth / 2,
1805dcc25ae7STejun Heo global_wb_domain.dirty_limit / DIRTY_SCOPE);
18061a12d8bdSWu Fengguang pages = min(pages, work->nr_pages);
18071a12d8bdSWu Fengguang pages = round_down(pages + MIN_WRITEBACK_PAGES,
18081a12d8bdSWu Fengguang MIN_WRITEBACK_PAGES);
18091a12d8bdSWu Fengguang }
1810d46db3d5SWu Fengguang
1811d46db3d5SWu Fengguang return pages;
1812d46db3d5SWu Fengguang }
1813d46db3d5SWu Fengguang
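/*
 * Worked example for the non-integrity branch above (hypothetical numbers,
 * assuming 4K pages so MIN_WRITEBACK_PAGES == 1024): if
 * wb->avg_write_bandwidth / 2 == 12800 pages,
 * global_wb_domain.dirty_limit / DIRTY_SCOPE == 50000 pages and
 * work->nr_pages == 10000, then pages becomes min(12800, 50000) == 12800,
 * is capped to 10000 by work->nr_pages, and
 * round_down(10000 + 1024, 1024) == 10240, i.e. a multiple of the 4MB
 * minimum chunk that may slightly exceed the requested page count.
 */
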
181403ba3782SJens Axboe /*
1815f11c9c5cSEdward Shishkin * Write a portion of b_io inodes which belong to @sb.
1816edadfb10SChristoph Hellwig *
1817d46db3d5SWu Fengguang * Return the number of pages and/or inodes written.
18180ba13fd1SLinus Torvalds *
18190ba13fd1SLinus Torvalds * NOTE! This is called with wb->list_lock held, and will
18200ba13fd1SLinus Torvalds * unlock and relock that for each inode it ends up doing
18210ba13fd1SLinus Torvalds * IO for.
1822f11c9c5cSEdward Shishkin */
1823d46db3d5SWu Fengguang static long writeback_sb_inodes(struct super_block *sb,
1824d46db3d5SWu Fengguang struct bdi_writeback *wb,
1825d46db3d5SWu Fengguang struct wb_writeback_work *work)
182603ba3782SJens Axboe {
1827d46db3d5SWu Fengguang struct writeback_control wbc = {
1828d46db3d5SWu Fengguang .sync_mode = work->sync_mode,
1829d46db3d5SWu Fengguang .tagged_writepages = work->tagged_writepages,
1830d46db3d5SWu Fengguang .for_kupdate = work->for_kupdate,
1831d46db3d5SWu Fengguang .for_background = work->for_background,
18327747bd4bSDave Chinner .for_sync = work->for_sync,
1833d46db3d5SWu Fengguang .range_cyclic = work->range_cyclic,
1834d46db3d5SWu Fengguang .range_start = 0,
1835d46db3d5SWu Fengguang .range_end = LLONG_MAX,
1836d46db3d5SWu Fengguang };
1837d46db3d5SWu Fengguang unsigned long start_time = jiffies;
1838d46db3d5SWu Fengguang long write_chunk;
183968f4c6ebSZhihao Cheng long total_wrote = 0; /* count both pages and inodes */
1840d46db3d5SWu Fengguang
184103ba3782SJens Axboe while (!list_empty(&wb->b_io)) {
18427ccf19a8SNick Piggin struct inode *inode = wb_inode(wb->b_io.prev);
1843aaf25593STejun Heo struct bdi_writeback *tmp_wb;
184468f4c6ebSZhihao Cheng long wrote;
1845edadfb10SChristoph Hellwig
1846edadfb10SChristoph Hellwig if (inode->i_sb != sb) {
1847d46db3d5SWu Fengguang if (work->sb) {
1848edadfb10SChristoph Hellwig /*
1849edadfb10SChristoph Hellwig * We only want to write back data for this
1850edadfb10SChristoph Hellwig * superblock, move all inodes not belonging
1851edadfb10SChristoph Hellwig * to it back onto the dirty list.
1852edadfb10SChristoph Hellwig */
1853f758eeabSChristoph Hellwig redirty_tail(inode, wb);
185466f3b8e2SJens Axboe continue;
185566f3b8e2SJens Axboe }
1856edadfb10SChristoph Hellwig
1857edadfb10SChristoph Hellwig /*
1858edadfb10SChristoph Hellwig * The inode belongs to a different superblock.
1859edadfb10SChristoph Hellwig * Bounce back to the caller to unpin this and
1860edadfb10SChristoph Hellwig * pin the next superblock.
1861edadfb10SChristoph Hellwig */
1862d46db3d5SWu Fengguang break;
1863edadfb10SChristoph Hellwig }
1864edadfb10SChristoph Hellwig
18659843b76aSChristoph Hellwig /*
1866331cbdeeSWanpeng Li * Don't bother with new inodes or inodes being freed: the first
1867331cbdeeSWanpeng Li * kind does not need periodic writeout yet, and for the latter
18689843b76aSChristoph Hellwig * kind writeout is handled by the freer.
18699843b76aSChristoph Hellwig */
1870250df6edSDave Chinner spin_lock(&inode->i_lock);
18719843b76aSChristoph Hellwig if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1872b35250c0SJan Kara redirty_tail_locked(inode, wb);
1873250df6edSDave Chinner spin_unlock(&inode->i_lock);
18747ef0d737SNick Piggin continue;
18757ef0d737SNick Piggin }
1876cc1676d9SJan Kara if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1877cc1676d9SJan Kara /*
1878cc1676d9SJan Kara * If this inode is locked for writeback and we are not
1879cc1676d9SJan Kara * doing writeback-for-data-integrity, move it to
1880cc1676d9SJan Kara * b_more_io so that writeback can proceed with the
1881cc1676d9SJan Kara * other inodes on s_io.
1882cc1676d9SJan Kara *
1883cc1676d9SJan Kara * We'll have another go at writing back this inode
1884cc1676d9SJan Kara * when we completed a full scan of b_io.
1885cc1676d9SJan Kara */
1886cc1676d9SJan Kara requeue_io(inode, wb);
188710e14073SJchao Sun spin_unlock(&inode->i_lock);
1888cc1676d9SJan Kara trace_writeback_sb_inodes_requeue(inode);
1889cc1676d9SJan Kara continue;
1890cc1676d9SJan Kara }
1891f0d07b7fSJan Kara spin_unlock(&wb->list_lock);
1892f0d07b7fSJan Kara
18934f8ad655SJan Kara /*
18944f8ad655SJan Kara * We already requeued the inode if it had I_SYNC set and we
18954f8ad655SJan Kara * are doing WB_SYNC_NONE writeback. So this catches only the
18964f8ad655SJan Kara * WB_SYNC_ALL case.
18974f8ad655SJan Kara */
1898169ebd90SJan Kara if (inode->i_state & I_SYNC) {
1899169ebd90SJan Kara /* Wait for I_SYNC. This function drops i_lock... */
1900169ebd90SJan Kara inode_sleep_on_writeback(inode);
1901169ebd90SJan Kara /* Inode may be gone, start again */
1902ead188f9SJan Kara spin_lock(&wb->list_lock);
1903169ebd90SJan Kara continue;
1904169ebd90SJan Kara }
19054f8ad655SJan Kara inode->i_state |= I_SYNC;
1906b16b1debSTejun Heo wbc_attach_and_unlock_inode(&wbc, inode);
1907169ebd90SJan Kara
1908a88a341aSTejun Heo write_chunk = writeback_chunk_size(wb, work);
1909d46db3d5SWu Fengguang wbc.nr_to_write = write_chunk;
1910d46db3d5SWu Fengguang wbc.pages_skipped = 0;
1911250df6edSDave Chinner
1912169ebd90SJan Kara /*
1913169ebd90SJan Kara * We use I_SYNC to pin the inode in memory. While it is set
1914169ebd90SJan Kara * evict_inode() will wait so the inode cannot be freed.
1915169ebd90SJan Kara */
1916cd8ed2a4SYan Hong __writeback_single_inode(inode, &wbc);
1917d46db3d5SWu Fengguang
1918b16b1debSTejun Heo wbc_detach_inode(&wbc);
1919d46db3d5SWu Fengguang work->nr_pages -= write_chunk - wbc.nr_to_write;
192068f4c6ebSZhihao Cheng wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
192168f4c6ebSZhihao Cheng wrote = wrote < 0 ? 0 : wrote;
192268f4c6ebSZhihao Cheng total_wrote += wrote;
1923590dca3aSChris Mason
1924590dca3aSChris Mason if (need_resched()) {
1925590dca3aSChris Mason /*
1926590dca3aSChris Mason * We're trying to balance between building up a nice
1927590dca3aSChris Mason * long list of IOs to improve our merge rate, and
1928590dca3aSChris Mason * getting those IOs out quickly for anyone throttling
1929590dca3aSChris Mason * in balance_dirty_pages(). cond_resched() doesn't
1930590dca3aSChris Mason * unplug, so get our IOs out the door before we
1931590dca3aSChris Mason * give up the CPU.
1932590dca3aSChris Mason */
1933008f75a2SChristoph Hellwig blk_flush_plug(current->plug, false);
1934590dca3aSChris Mason cond_resched();
1935590dca3aSChris Mason }
1936590dca3aSChris Mason
1937aaf25593STejun Heo /*
1938aaf25593STejun Heo * Requeue @inode if still dirty. Be careful as @inode may
1939aaf25593STejun Heo * have been switched to another wb in the meantime.
1940aaf25593STejun Heo */
1941aaf25593STejun Heo tmp_wb = inode_to_wb_and_lock_list(inode);
19424f8ad655SJan Kara spin_lock(&inode->i_lock);
19430ae45f63STheodore Ts'o if (!(inode->i_state & I_DIRTY_ALL))
194468f4c6ebSZhihao Cheng total_wrote++;
1945aaf25593STejun Heo requeue_inode(inode, tmp_wb, &wbc);
19464f8ad655SJan Kara inode_sync_complete(inode);
19470f1b1fd8SDave Chinner spin_unlock(&inode->i_lock);
1948590dca3aSChris Mason
1949aaf25593STejun Heo if (unlikely(tmp_wb != wb)) {
1950aaf25593STejun Heo spin_unlock(&tmp_wb->list_lock);
1951aaf25593STejun Heo spin_lock(&wb->list_lock);
1952aaf25593STejun Heo }
1953aaf25593STejun Heo
1954d46db3d5SWu Fengguang /*
1955d46db3d5SWu Fengguang * bail out to wb_writeback() often enough to check
1956d46db3d5SWu Fengguang * background threshold and other termination conditions.
1957d46db3d5SWu Fengguang */
195868f4c6ebSZhihao Cheng if (total_wrote) {
1959d46db3d5SWu Fengguang if (time_is_before_jiffies(start_time + HZ / 10UL))
1960d46db3d5SWu Fengguang break;
1961d46db3d5SWu Fengguang if (work->nr_pages <= 0)
1962d46db3d5SWu Fengguang break;
19631da177e4SLinus Torvalds }
19648bc3be27SFengguang Wu }
196568f4c6ebSZhihao Cheng return total_wrote;
1966f11c9c5cSEdward Shishkin }
196738f21977SNick Piggin
1968d46db3d5SWu Fengguang static long __writeback_inodes_wb(struct bdi_writeback *wb,
1969d46db3d5SWu Fengguang struct wb_writeback_work *work)
1970f11c9c5cSEdward Shishkin {
1971d46db3d5SWu Fengguang unsigned long start_time = jiffies;
1972d46db3d5SWu Fengguang long wrote = 0;
1973f11c9c5cSEdward Shishkin
1974f11c9c5cSEdward Shishkin while (!list_empty(&wb->b_io)) {
19757ccf19a8SNick Piggin struct inode *inode = wb_inode(wb->b_io.prev);
1976f11c9c5cSEdward Shishkin struct super_block *sb = inode->i_sb;
1977f11c9c5cSEdward Shishkin
1978d8ce82efSChristian Brauner if (!super_trylock_shared(sb)) {
19790e995816SWu Fengguang /*
1980d8ce82efSChristian Brauner * super_trylock_shared() may fail consistently due to
19810e995816SWu Fengguang * s_umount being grabbed by someone else. Don't use
19820e995816SWu Fengguang * requeue_io() to avoid busy retrying the inode/sb.
19830e995816SWu Fengguang */
19840e995816SWu Fengguang redirty_tail(inode, wb);
1985d19de7edSChristoph Hellwig continue;
1986334132aeSChristoph Hellwig }
1987d46db3d5SWu Fengguang wrote += writeback_sb_inodes(sb, wb, work);
1988eb6ef3dfSKonstantin Khlebnikov up_read(&sb->s_umount);
1989f11c9c5cSEdward Shishkin
1990d46db3d5SWu Fengguang /* refer to the same tests at the end of writeback_sb_inodes */
1991d46db3d5SWu Fengguang if (wrote) {
1992d46db3d5SWu Fengguang if (time_is_before_jiffies(start_time + HZ / 10UL))
1993d46db3d5SWu Fengguang break;
1994d46db3d5SWu Fengguang if (work->nr_pages <= 0)
1995f11c9c5cSEdward Shishkin break;
1996f11c9c5cSEdward Shishkin }
1997d46db3d5SWu Fengguang }
199866f3b8e2SJens Axboe /* Leave any unwritten inodes on b_io */
1999d46db3d5SWu Fengguang return wrote;
200066f3b8e2SJens Axboe }
200166f3b8e2SJens Axboe
20027d9f073bSWanpeng Li static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
20030e175a18SCurt Wohlgemuth enum wb_reason reason)
2004edadfb10SChristoph Hellwig {
2005d46db3d5SWu Fengguang struct wb_writeback_work work = {
2006d46db3d5SWu Fengguang .nr_pages = nr_pages,
2007d46db3d5SWu Fengguang .sync_mode = WB_SYNC_NONE,
2008d46db3d5SWu Fengguang .range_cyclic = 1,
20090e175a18SCurt Wohlgemuth .reason = reason,
2010d46db3d5SWu Fengguang };
2011505a666eSLinus Torvalds struct blk_plug plug;
2012edadfb10SChristoph Hellwig
2013505a666eSLinus Torvalds blk_start_plug(&plug);
2014f758eeabSChristoph Hellwig spin_lock(&wb->list_lock);
2015424b351fSWu Fengguang if (list_empty(&wb->b_io))
2016f9cae926SJan Kara queue_io(wb, &work, jiffies);
2017d46db3d5SWu Fengguang __writeback_inodes_wb(wb, &work);
2018f758eeabSChristoph Hellwig spin_unlock(&wb->list_lock);
2019505a666eSLinus Torvalds blk_finish_plug(&plug);
2020edadfb10SChristoph Hellwig
2021d46db3d5SWu Fengguang return nr_pages - work.nr_pages;
202266f3b8e2SJens Axboe }
202366f3b8e2SJens Axboe
202403ba3782SJens Axboe /*
202503ba3782SJens Axboe * Explicit flushing or periodic writeback of "old" data.
202603ba3782SJens Axboe *
202703ba3782SJens Axboe * Define "old": the first time one of an inode's pages is dirtied, we mark the
202803ba3782SJens Axboe * dirtying-time in the inode's address_space. So this periodic writeback code
202903ba3782SJens Axboe * just walks the superblock inode list, writing back any inodes which are
203003ba3782SJens Axboe * older than a specific point in time.
203103ba3782SJens Axboe *
203203ba3782SJens Axboe * Try to run once per dirty_writeback_interval. But if a writeback event
203303ba3782SJens Axboe * takes longer than one dirty_writeback_interval, then leave a
203403ba3782SJens Axboe * one-second gap.
203503ba3782SJens Axboe *
2036f9cae926SJan Kara * dirtied_before takes precedence over nr_to_write. So we'll only write back
203703ba3782SJens Axboe * all dirty pages if they are all attached to "old" mappings.
203803ba3782SJens Axboe */
2039c4a77a6cSJens Axboe static long wb_writeback(struct bdi_writeback *wb,
204083ba7b07SChristoph Hellwig struct wb_writeback_work *work)
204103ba3782SJens Axboe {
2042d46db3d5SWu Fengguang long nr_pages = work->nr_pages;
2043f9cae926SJan Kara unsigned long dirtied_before = jiffies;
2044a5989bdcSJan Kara struct inode *inode;
2045d46db3d5SWu Fengguang long progress;
2046505a666eSLinus Torvalds struct blk_plug plug;
2047*0eeb28d7SKemeng Shi bool queued = false;
204803ba3782SJens Axboe
2049505a666eSLinus Torvalds blk_start_plug(&plug);
205003ba3782SJens Axboe for (;;) {
205103ba3782SJens Axboe /*
2052d3ddec76SWu Fengguang * Stop writeback when nr_pages has been consumed
205303ba3782SJens Axboe */
205483ba7b07SChristoph Hellwig if (work->nr_pages <= 0)
205503ba3782SJens Axboe break;
205603ba3782SJens Axboe
205703ba3782SJens Axboe /*
2058aa373cf5SJan Kara * Background writeout and kupdate-style writeback may
2059aa373cf5SJan Kara * run forever. Stop them if there is other work to do
2060aa373cf5SJan Kara * so that e.g. sync can proceed. They'll be restarted
2061aa373cf5SJan Kara * after the other works are all done.
2062aa373cf5SJan Kara */
2063aa373cf5SJan Kara if ((work->for_background || work->for_kupdate) &&
2064f0054bb1STejun Heo !list_empty(&wb->work_list))
2065aa373cf5SJan Kara break;
2066aa373cf5SJan Kara
2067aa373cf5SJan Kara /*
2068d3ddec76SWu Fengguang * For background writeout, stop when we are below the
2069d3ddec76SWu Fengguang * background dirty threshold
207003ba3782SJens Axboe */
2071aa661bbeSTejun Heo if (work->for_background && !wb_over_bg_thresh(wb))
207203ba3782SJens Axboe break;
207303ba3782SJens Axboe
20742816ea2aSYosry Ahmed
20752816ea2aSYosry Ahmed spin_lock(&wb->list_lock);
20762816ea2aSYosry Ahmed
20771bc36b64SJan Kara /*
20781bc36b64SJan Kara * Kupdate and background works are special and we want to
20791bc36b64SJan Kara * include all inodes that need writing. Livelock avoidance is
20801bc36b64SJan Kara * handled by these works yielding to any other work so we are
20811bc36b64SJan Kara * safe.
20821bc36b64SJan Kara */
2083ba9aa839SWu Fengguang if (work->for_kupdate) {
2084f9cae926SJan Kara dirtied_before = jiffies -
2085ba9aa839SWu Fengguang msecs_to_jiffies(dirty_expire_interval * 10);
20861bc36b64SJan Kara } else if (work->for_background)
2087f9cae926SJan Kara dirtied_before = jiffies;
2088028c2dd1SDave Chinner
20895634cc2aSTejun Heo trace_writeback_start(wb, work);
2090*0eeb28d7SKemeng Shi if (list_empty(&wb->b_io)) {
2091f9cae926SJan Kara queue_io(wb, work, dirtied_before);
2092*0eeb28d7SKemeng Shi queued = true;
2093*0eeb28d7SKemeng Shi }
209483ba7b07SChristoph Hellwig if (work->sb)
2095d46db3d5SWu Fengguang progress = writeback_sb_inodes(work->sb, wb, work);
2096edadfb10SChristoph Hellwig else
2097d46db3d5SWu Fengguang progress = __writeback_inodes_wb(wb, work);
20985634cc2aSTejun Heo trace_writeback_written(wb, work);
2099028c2dd1SDave Chinner
210003ba3782SJens Axboe /*
210171fd05a8SJens Axboe * Did we write something? Try for more
2102e6fb6da2SWu Fengguang *
2103e6fb6da2SWu Fengguang * Dirty inodes are moved to b_io for writeback in batches.
2104e6fb6da2SWu Fengguang * The completion of the current batch does not necessarily
2105e6fb6da2SWu Fengguang * mean the overall work is done. So we keep looping as long
2106e6fb6da2SWu Fengguang * as we make some progress on cleaning pages or inodes.
210771fd05a8SJens Axboe */
2108*0eeb28d7SKemeng Shi if (progress || !queued) {
21092816ea2aSYosry Ahmed spin_unlock(&wb->list_lock);
211003ba3782SJens Axboe continue;
21112816ea2aSYosry Ahmed }
21122816ea2aSYosry Ahmed
2113a5989bdcSJan Kara /*
2114e6fb6da2SWu Fengguang * No more inodes for IO, bail
2115a5989bdcSJan Kara */
21162816ea2aSYosry Ahmed if (list_empty(&wb->b_more_io)) {
21172816ea2aSYosry Ahmed spin_unlock(&wb->list_lock);
211803ba3782SJens Axboe break;
21192816ea2aSYosry Ahmed }
21202816ea2aSYosry Ahmed
212103ba3782SJens Axboe /*
21228010c3b6SJens Axboe * Nothing written. Wait for some inode to
21238010c3b6SJens Axboe * become available for writeback. Otherwise
21248010c3b6SJens Axboe * we'll just busyloop.
212503ba3782SJens Axboe */
21265634cc2aSTejun Heo trace_writeback_wait(wb, work);
212703ba3782SJens Axboe inode = wb_inode(wb->b_more_io.prev);
2128250df6edSDave Chinner spin_lock(&inode->i_lock);
2129f0d07b7fSJan Kara spin_unlock(&wb->list_lock);
2130169ebd90SJan Kara /* This function drops i_lock... */
2131169ebd90SJan Kara inode_sleep_on_writeback(inode);
213203ba3782SJens Axboe }
2133505a666eSLinus Torvalds blk_finish_plug(&plug);
213403ba3782SJens Axboe
2135d46db3d5SWu Fengguang return nr_pages - work->nr_pages;
213603ba3782SJens Axboe }
213703ba3782SJens Axboe
213803ba3782SJens Axboe /*
213983ba7b07SChristoph Hellwig * Return the next wb_writeback_work struct that hasn't been processed yet.
214003ba3782SJens Axboe */
2141f0054bb1STejun Heo static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
214203ba3782SJens Axboe {
214383ba7b07SChristoph Hellwig struct wb_writeback_work *work = NULL;
214403ba3782SJens Axboe
2145f87904c0SKhazhismel Kumykov spin_lock_irq(&wb->work_lock);
2146f0054bb1STejun Heo if (!list_empty(&wb->work_list)) {
2147f0054bb1STejun Heo work = list_entry(wb->work_list.next,
214883ba7b07SChristoph Hellwig struct wb_writeback_work, list);
214983ba7b07SChristoph Hellwig list_del_init(&work->list);
215003ba3782SJens Axboe }
2151f87904c0SKhazhismel Kumykov spin_unlock_irq(&wb->work_lock);
215283ba7b07SChristoph Hellwig return work;
215303ba3782SJens Axboe }
215403ba3782SJens Axboe
21556585027aSJan Kara static long wb_check_background_flush(struct bdi_writeback *wb)
21566585027aSJan Kara {
2157aa661bbeSTejun Heo if (wb_over_bg_thresh(wb)) {
21586585027aSJan Kara
21596585027aSJan Kara struct wb_writeback_work work = {
21606585027aSJan Kara .nr_pages = LONG_MAX,
21616585027aSJan Kara .sync_mode = WB_SYNC_NONE,
21626585027aSJan Kara .for_background = 1,
21636585027aSJan Kara .range_cyclic = 1,
21640e175a18SCurt Wohlgemuth .reason = WB_REASON_BACKGROUND,
21656585027aSJan Kara };
21666585027aSJan Kara
21676585027aSJan Kara return wb_writeback(wb, &work);
21686585027aSJan Kara }
21696585027aSJan Kara
21706585027aSJan Kara return 0;
21716585027aSJan Kara }
21726585027aSJan Kara
217303ba3782SJens Axboe static long wb_check_old_data_flush(struct bdi_writeback *wb)
217403ba3782SJens Axboe {
217503ba3782SJens Axboe unsigned long expired;
217603ba3782SJens Axboe long nr_pages;
217703ba3782SJens Axboe
217869b62d01SJens Axboe /*
217969b62d01SJens Axboe * When set to zero, disable periodic writeback
218069b62d01SJens Axboe */
218169b62d01SJens Axboe if (!dirty_writeback_interval)
218269b62d01SJens Axboe return 0;
218369b62d01SJens Axboe
218403ba3782SJens Axboe expired = wb->last_old_flush +
218503ba3782SJens Axboe msecs_to_jiffies(dirty_writeback_interval * 10);
218603ba3782SJens Axboe if (time_before(jiffies, expired))
218703ba3782SJens Axboe return 0;
218803ba3782SJens Axboe
218903ba3782SJens Axboe wb->last_old_flush = jiffies;
2190cdf01dd5SLinus Torvalds nr_pages = get_nr_dirty_pages();
219103ba3782SJens Axboe
2192c4a77a6cSJens Axboe if (nr_pages) {
219383ba7b07SChristoph Hellwig struct wb_writeback_work work = {
2194c4a77a6cSJens Axboe .nr_pages = nr_pages,
2195c4a77a6cSJens Axboe .sync_mode = WB_SYNC_NONE,
2196c4a77a6cSJens Axboe .for_kupdate = 1,
2197c4a77a6cSJens Axboe .range_cyclic = 1,
21980e175a18SCurt Wohlgemuth .reason = WB_REASON_PERIODIC,
2199c4a77a6cSJens Axboe };
2200c4a77a6cSJens Axboe
220183ba7b07SChristoph Hellwig return wb_writeback(wb, &work);
2202c4a77a6cSJens Axboe }
220303ba3782SJens Axboe
220403ba3782SJens Axboe return 0;
220503ba3782SJens Axboe }
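
/*
 * Illustrative arithmetic (assuming the default tunables, not a claim about
 * any particular configuration): dirty_writeback_interval defaults to 500
 * centisecs, so the expiry check above works out to
 *
 *	msecs_to_jiffies(500 * 10)	== 5 seconds
 *
 * i.e. at most one kupdate-style pass is started roughly every 5 seconds.
 */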
220603ba3782SJens Axboe
220785009b4fSJens Axboe static long wb_check_start_all(struct bdi_writeback *wb)
220885009b4fSJens Axboe {
220985009b4fSJens Axboe long nr_pages;
221085009b4fSJens Axboe
221185009b4fSJens Axboe if (!test_bit(WB_start_all, &wb->state))
221285009b4fSJens Axboe return 0;
221385009b4fSJens Axboe
221485009b4fSJens Axboe nr_pages = get_nr_dirty_pages();
221585009b4fSJens Axboe if (nr_pages) {
221685009b4fSJens Axboe struct wb_writeback_work work = {
221785009b4fSJens Axboe .nr_pages = wb_split_bdi_pages(wb, nr_pages),
221885009b4fSJens Axboe .sync_mode = WB_SYNC_NONE,
221985009b4fSJens Axboe .range_cyclic = 1,
222085009b4fSJens Axboe .reason = wb->start_all_reason,
222185009b4fSJens Axboe };
222285009b4fSJens Axboe
222385009b4fSJens Axboe nr_pages = wb_writeback(wb, &work);
222485009b4fSJens Axboe }
222585009b4fSJens Axboe
222685009b4fSJens Axboe clear_bit(WB_start_all, &wb->state);
222785009b4fSJens Axboe return nr_pages;
222885009b4fSJens Axboe }
222985009b4fSJens Axboe
223085009b4fSJens Axboe
223103ba3782SJens Axboe /*
223203ba3782SJens Axboe * Retrieve work items and do the writeback they describe
223303ba3782SJens Axboe */
223425d130baSWanpeng Li static long wb_do_writeback(struct bdi_writeback *wb)
223503ba3782SJens Axboe {
223683ba7b07SChristoph Hellwig struct wb_writeback_work *work;
2237c4a77a6cSJens Axboe long wrote = 0;
223803ba3782SJens Axboe
22394452226eSTejun Heo set_bit(WB_writeback_running, &wb->state);
2240f0054bb1STejun Heo while ((work = get_next_work_item(wb)) != NULL) {
22415634cc2aSTejun Heo trace_writeback_exec(wb, work);
224283ba7b07SChristoph Hellwig wrote += wb_writeback(wb, work);
22434a3a485bSTahsin Erdogan finish_writeback_work(wb, work);
224403ba3782SJens Axboe }
224503ba3782SJens Axboe
224603ba3782SJens Axboe /*
224785009b4fSJens Axboe * Check for a flush-everything request
224885009b4fSJens Axboe */
224985009b4fSJens Axboe wrote += wb_check_start_all(wb);
225085009b4fSJens Axboe
225185009b4fSJens Axboe /*
225203ba3782SJens Axboe * Check for periodic writeback, kupdated() style
225303ba3782SJens Axboe */
225403ba3782SJens Axboe wrote += wb_check_old_data_flush(wb);
22556585027aSJan Kara wrote += wb_check_background_flush(wb);
22564452226eSTejun Heo clear_bit(WB_writeback_running, &wb->state);
225703ba3782SJens Axboe
225803ba3782SJens Axboe return wrote;
225903ba3782SJens Axboe }
226003ba3782SJens Axboe
226103ba3782SJens Axboe /*
226203ba3782SJens Axboe * Handle writeback of dirty data for the device backed by this bdi. Also
2263839a8e86STejun Heo * reschedules periodically and does kupdated-style flushing.
226403ba3782SJens Axboe */
2265f0054bb1STejun Heo void wb_workfn(struct work_struct *work)
226603ba3782SJens Axboe {
2267839a8e86STejun Heo struct bdi_writeback *wb = container_of(to_delayed_work(work),
2268839a8e86STejun Heo struct bdi_writeback, dwork);
226903ba3782SJens Axboe long pages_written;
227003ba3782SJens Axboe
227168f23b89STheodore Ts'o set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
227203ba3782SJens Axboe
2273839a8e86STejun Heo if (likely(!current_is_workqueue_rescuer() ||
22744452226eSTejun Heo !test_bit(WB_registered, &wb->state))) {
227503ba3782SJens Axboe /*
2276f0054bb1STejun Heo * The normal path. Keep writing back @wb until its
2277839a8e86STejun Heo * work_list is empty. Note that this path is also taken
2278f0054bb1STejun Heo * if @wb is shutting down even when we're running off the
2279839a8e86STejun Heo * rescuer as work_list needs to be drained.
228003ba3782SJens Axboe */
2281839a8e86STejun Heo do {
228225d130baSWanpeng Li pages_written = wb_do_writeback(wb);
2283455b2864SDave Chinner trace_writeback_pages_written(pages_written);
2284f0054bb1STejun Heo } while (!list_empty(&wb->work_list));
2285839a8e86STejun Heo } else {
2286253c34e9SArtem Bityutskiy /*
2287839a8e86STejun Heo * bdi_wq can't get enough workers and we're running off
2288839a8e86STejun Heo * the emergency worker. Don't hog it. Hopefully, 1024 is
2289839a8e86STejun Heo * enough for efficient IO.
2290253c34e9SArtem Bityutskiy */
2291f0054bb1STejun Heo pages_written = writeback_inodes_wb(wb, 1024,
2292839a8e86STejun Heo WB_REASON_FORKER_THREAD);
2293839a8e86STejun Heo trace_writeback_pages_written(pages_written);
229403ba3782SJens Axboe }
229503ba3782SJens Axboe
2296f0054bb1STejun Heo if (!list_empty(&wb->work_list))
2297b8b78495SJan Kara wb_wakeup(wb);
22986ca738d6SDerek Basehore else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
2299f0054bb1STejun Heo wb_wakeup_delayed(wb);
230003ba3782SJens Axboe }
230103ba3782SJens Axboe
230203ba3782SJens Axboe /*
2303595043e5SJens Axboe * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero,
2304595043e5SJens Axboe * write back the whole world.
2305595043e5SJens Axboe */
2306595043e5SJens Axboe static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2307e8e8a0c6SJens Axboe enum wb_reason reason)
2308595043e5SJens Axboe {
2309595043e5SJens Axboe struct bdi_writeback *wb;
2310595043e5SJens Axboe
2311595043e5SJens Axboe if (!bdi_has_dirty_io(bdi))
2312595043e5SJens Axboe return;
2313595043e5SJens Axboe
2314595043e5SJens Axboe list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2315e8e8a0c6SJens Axboe wb_start_writeback(wb, reason);
2316595043e5SJens Axboe }
2317595043e5SJens Axboe
2318595043e5SJens Axboe void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2319595043e5SJens Axboe enum wb_reason reason)
2320595043e5SJens Axboe {
2321595043e5SJens Axboe rcu_read_lock();
2322e8e8a0c6SJens Axboe __wakeup_flusher_threads_bdi(bdi, reason);
2323595043e5SJens Axboe rcu_read_unlock();
2324595043e5SJens Axboe }
2325595043e5SJens Axboe
2326595043e5SJens Axboe /*
23279ba4b2dfSJens Axboe * Wakeup the flusher threads to start writeback of all currently dirty pages
232803ba3782SJens Axboe */
23299ba4b2dfSJens Axboe void wakeup_flusher_threads(enum wb_reason reason)
233003ba3782SJens Axboe {
2331b8c2f347SChristoph Hellwig struct backing_dev_info *bdi;
2332b8c2f347SChristoph Hellwig
233351350ea0SKonstantin Khlebnikov /*
233451350ea0SKonstantin Khlebnikov * If we are expecting writeback progress we must submit plugged IO.
233551350ea0SKonstantin Khlebnikov */
2336008f75a2SChristoph Hellwig blk_flush_plug(current->plug, true);
233751350ea0SKonstantin Khlebnikov
2338b8c2f347SChristoph Hellwig rcu_read_lock();
2339595043e5SJens Axboe list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
2340e8e8a0c6SJens Axboe __wakeup_flusher_threads_bdi(bdi, reason);
2341b8c2f347SChristoph Hellwig rcu_read_unlock();
234203ba3782SJens Axboe }
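
/*
 * Illustrative sketch (callers assumed, not part of this file): sync(2) and
 * memory reclaim wake every flusher roughly like this:
 *
 *	wakeup_flusher_threads(WB_REASON_SYNC);
 *	wakeup_flusher_threads(WB_REASON_VMSCAN);
 *
 * Both only kick off asynchronous WB_SYNC_NONE writeback; neither waits for
 * any particular page to actually reach disk.
 */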
234303ba3782SJens Axboe
2344a2f48706STheodore Ts'o /*
2345a2f48706STheodore Ts'o * Wake up bdi's periodically to make sure dirtytime inodes get
2346a2f48706STheodore Ts'o * written back periodically. We deliberately do *not* check the
2347a2f48706STheodore Ts'o * b_dirtytime list in wb_has_dirty_io(), since this would cause the
2348a2f48706STheodore Ts'o * kernel to be constantly waking up once there are any dirtytime
2349a2f48706STheodore Ts'o * inodes on the system. So instead we define a separate delayed work
2350a2f48706STheodore Ts'o * function which gets called much more rarely. (By default, only
2351a2f48706STheodore Ts'o * once every 12 hours.)
2352a2f48706STheodore Ts'o *
2353a2f48706STheodore Ts'o * If there is any other write activity going on in the file system,
2354a2f48706STheodore Ts'o * this function won't be necessary. But if the only thing that has
2355a2f48706STheodore Ts'o * happened on the file system is a dirtytime inode caused by an atime
2356a2f48706STheodore Ts'o * update, we need this infrastructure below to make sure that inode
2357a2f48706STheodore Ts'o * eventually gets pushed out to disk.
2358a2f48706STheodore Ts'o */
2359a2f48706STheodore Ts'o static void wakeup_dirtytime_writeback(struct work_struct *w);
2360a2f48706STheodore Ts'o static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
2361a2f48706STheodore Ts'o
2362a2f48706STheodore Ts'o static void wakeup_dirtytime_writeback(struct work_struct *w)
2363a2f48706STheodore Ts'o {
2364a2f48706STheodore Ts'o struct backing_dev_info *bdi;
2365a2f48706STheodore Ts'o
2366a2f48706STheodore Ts'o rcu_read_lock();
2367a2f48706STheodore Ts'o list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
2368001fe6f6STejun Heo struct bdi_writeback *wb;
2369001fe6f6STejun Heo
2370b817525aSTejun Heo list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
23716fdf860fSTejun Heo if (!list_empty(&wb->b_dirty_time))
23726fdf860fSTejun Heo wb_wakeup(wb);
2373a2f48706STheodore Ts'o }
2374a2f48706STheodore Ts'o rcu_read_unlock();
2375a2f48706STheodore Ts'o schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2376a2f48706STheodore Ts'o }
2377a2f48706STheodore Ts'o
2378a2f48706STheodore Ts'o static int __init start_dirtytime_writeback(void)
2379a2f48706STheodore Ts'o {
2380a2f48706STheodore Ts'o schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2381a2f48706STheodore Ts'o return 0;
2382a2f48706STheodore Ts'o }
2383a2f48706STheodore Ts'o __initcall(start_dirtytime_writeback);
2384a2f48706STheodore Ts'o
23851efff914STheodore Ts'o int dirtytime_interval_handler(struct ctl_table *table, int write,
23869ca48e20STobias Klauser void *buffer, size_t *lenp, loff_t *ppos)
23871efff914STheodore Ts'o {
23881efff914STheodore Ts'o int ret;
23891efff914STheodore Ts'o
23901efff914STheodore Ts'o ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
23911efff914STheodore Ts'o if (ret == 0 && write)
23921efff914STheodore Ts'o mod_delayed_work(system_wq, &dirtytime_work, 0);
23931efff914STheodore Ts'o return ret;
23941efff914STheodore Ts'o }
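
/*
 * Illustrative usage (sysctl wiring assumed, see kernel/sysctl.c): the
 * handler above re-arms dirtytime_work as soon as the knob is written, e.g.
 *
 *	echo 3600 > /proc/sys/vm/dirtytime_expire_seconds
 */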
23951efff914STheodore Ts'o
239603ba3782SJens Axboe /**
239735d14f27SEric Biggers * __mark_inode_dirty - internal function to mark an inode dirty
23980117d427SMauro Carvalho Chehab *
239903ba3782SJens Axboe * @inode: inode to mark
240035d14f27SEric Biggers * @flags: what kind of dirty, e.g. I_DIRTY_SYNC. This can be a combination of
240135d14f27SEric Biggers * multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined
240235d14f27SEric Biggers * with I_DIRTY_PAGES.
24030117d427SMauro Carvalho Chehab *
240435d14f27SEric Biggers * Mark an inode as dirty. We notify the filesystem, then update the inode's
240535d14f27SEric Biggers * dirty flags. Then, if needed we add the inode to the appropriate dirty list.
240603ba3782SJens Axboe *
240735d14f27SEric Biggers * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync()
240835d14f27SEric Biggers * instead of calling this directly.
240903ba3782SJens Axboe *
241035d14f27SEric Biggers * CAREFUL! We only add the inode to the dirty list if it is hashed or if it
241135d14f27SEric Biggers * refers to a blockdev. Unhashed inodes will never be added to the dirty list
241235d14f27SEric Biggers * even if they are later hashed, as they will have been marked dirty already.
241303ba3782SJens Axboe *
241435d14f27SEric Biggers * In short, ensure you hash any inodes _before_ you start marking them dirty.
241503ba3782SJens Axboe *
241603ba3782SJens Axboe * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
241703ba3782SJens Axboe * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
241803ba3782SJens Axboe * the kernel-internal blockdev inode represents the dirtying time of the
241903ba3782SJens Axboe * blockdev's pages. This is why for I_DIRTY_PAGES we always use
242003ba3782SJens Axboe * page->mapping->host, so the page-dirtying time is recorded in the internal
242103ba3782SJens Axboe * blockdev inode.
242203ba3782SJens Axboe */
242303ba3782SJens Axboe void __mark_inode_dirty(struct inode *inode, int flags)
242403ba3782SJens Axboe {
242503ba3782SJens Axboe struct super_block *sb = inode->i_sb;
242635d14f27SEric Biggers int dirtytime = 0;
242710e14073SJchao Sun struct bdi_writeback *wb = NULL;
24280ae45f63STheodore Ts'o
24290ae45f63STheodore Ts'o trace_writeback_mark_inode_dirty(inode, flags);
243003ba3782SJens Axboe
2431e2728c56SEric Biggers if (flags & I_DIRTY_INODE) {
243235d14f27SEric Biggers /*
2433cbfecb92SLukas Czerner * Inode timestamp update will piggyback on this dirtying.
2434cbfecb92SLukas Czerner * We tell the ->dirty_inode callback that timestamps need to
2435cbfecb92SLukas Czerner * be updated by setting I_DIRTY_TIME in flags.
2436cbfecb92SLukas Czerner */
2437cbfecb92SLukas Czerner if (inode->i_state & I_DIRTY_TIME) {
2438cbfecb92SLukas Czerner spin_lock(&inode->i_lock);
2439cbfecb92SLukas Czerner if (inode->i_state & I_DIRTY_TIME) {
2440cbfecb92SLukas Czerner inode->i_state &= ~I_DIRTY_TIME;
2441cbfecb92SLukas Czerner flags |= I_DIRTY_TIME;
2442cbfecb92SLukas Czerner }
2443cbfecb92SLukas Czerner spin_unlock(&inode->i_lock);
2444cbfecb92SLukas Czerner }
2445cbfecb92SLukas Czerner
2446cbfecb92SLukas Czerner /*
244735d14f27SEric Biggers * Notify the filesystem about the inode being dirtied, so that
244835d14f27SEric Biggers * (if needed) it can update on-disk fields and journal the
244935d14f27SEric Biggers * inode. This is only needed when the inode itself is being
245035d14f27SEric Biggers * dirtied now. I.e. it's only needed for I_DIRTY_INODE, not
245135d14f27SEric Biggers * for just I_DIRTY_PAGES or I_DIRTY_TIME.
245235d14f27SEric Biggers */
24539fb0a7daSTejun Heo trace_writeback_dirty_inode_start(inode, flags);
245403ba3782SJens Axboe if (sb->s_op->dirty_inode)
2455cbfecb92SLukas Czerner sb->s_op->dirty_inode(inode,
2456cbfecb92SLukas Czerner flags & (I_DIRTY_INODE | I_DIRTY_TIME));
24579fb0a7daSTejun Heo trace_writeback_dirty_inode(inode, flags);
2458e2728c56SEric Biggers
245935d14f27SEric Biggers /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
24600ae45f63STheodore Ts'o flags &= ~I_DIRTY_TIME;
246135d14f27SEric Biggers } else {
246235d14f27SEric Biggers /*
246335d14f27SEric Biggers * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing.
246435d14f27SEric Biggers * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME
246535d14f27SEric Biggers * in one call to __mark_inode_dirty().)
246635d14f27SEric Biggers */
24670ae45f63STheodore Ts'o dirtytime = flags & I_DIRTY_TIME;
246835d14f27SEric Biggers WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
246935d14f27SEric Biggers }
247003ba3782SJens Axboe
247103ba3782SJens Axboe /*
24729c6ac78eSTejun Heo * Paired with smp_mb() in __writeback_single_inode() for the
24739c6ac78eSTejun Heo * following lockless i_state test. See there for details.
247403ba3782SJens Axboe */
247503ba3782SJens Axboe smp_mb();
247603ba3782SJens Axboe
2477cbfecb92SLukas Czerner if ((inode->i_state & flags) == flags)
247803ba3782SJens Axboe return;
247903ba3782SJens Axboe
2480250df6edSDave Chinner spin_lock(&inode->i_lock);
248103ba3782SJens Axboe if ((inode->i_state & flags) != flags) {
248203ba3782SJens Axboe const int was_dirty = inode->i_state & I_DIRTY;
248303ba3782SJens Axboe
248452ebea74STejun Heo inode_attach_wb(inode, NULL);
248552ebea74STejun Heo
248603ba3782SJens Axboe inode->i_state |= flags;
248703ba3782SJens Axboe
248803ba3782SJens Axboe /*
248910e14073SJchao Sun * Grab inode's wb early because it requires dropping i_lock and we
249010e14073SJchao Sun * need to make sure following checks happen atomically with dirty
249110e14073SJchao Sun * list handling so that we don't move inodes under flush worker's
249210e14073SJchao Sun * hands.
249310e14073SJchao Sun */
249410e14073SJchao Sun if (!was_dirty) {
249510e14073SJchao Sun wb = locked_inode_to_wb_and_lock_list(inode);
249610e14073SJchao Sun spin_lock(&inode->i_lock);
249710e14073SJchao Sun }
249810e14073SJchao Sun
249910e14073SJchao Sun /*
25005afced3bSJan Kara * If the inode is queued for writeback by flush worker, just
25015afced3bSJan Kara * update its dirty state. Once the flush worker is done with
25025afced3bSJan Kara * the inode it will place it on the appropriate superblock
25035afced3bSJan Kara * list, based upon its state.
250403ba3782SJens Axboe */
25055afced3bSJan Kara if (inode->i_state & I_SYNC_QUEUED)
250610e14073SJchao Sun goto out_unlock;
250703ba3782SJens Axboe
250803ba3782SJens Axboe /*
250903ba3782SJens Axboe * Only add valid (hashed) inodes to the superblock's
251003ba3782SJens Axboe * dirty list. Add blockdev inodes as well.
251103ba3782SJens Axboe */
251203ba3782SJens Axboe if (!S_ISBLK(inode->i_mode)) {
25131d3382cbSAl Viro if (inode_unhashed(inode))
251410e14073SJchao Sun goto out_unlock;
251503ba3782SJens Axboe }
2516a4ffdde6SAl Viro if (inode->i_state & I_FREEING)
251710e14073SJchao Sun goto out_unlock;
251803ba3782SJens Axboe
251903ba3782SJens Axboe /*
252003ba3782SJens Axboe * If the inode was already on b_dirty/b_io/b_more_io, don't
252103ba3782SJens Axboe * reposition it (that would break b_dirty time-ordering).
252203ba3782SJens Axboe */
252303ba3782SJens Axboe if (!was_dirty) {
2524d6c10f1fSTejun Heo struct list_head *dirty_list;
2525a66979abSDave Chinner bool wakeup_bdi = false;
2526500b067cSJens Axboe
252703ba3782SJens Axboe inode->dirtied_when = jiffies;
2528a2f48706STheodore Ts'o if (dirtytime)
2529a2f48706STheodore Ts'o inode->dirtied_time_when = jiffies;
2530d6c10f1fSTejun Heo
25310e11f644SChristoph Hellwig if (inode->i_state & I_DIRTY)
25320747259dSTejun Heo dirty_list = &wb->b_dirty;
2533a2f48706STheodore Ts'o else
25340747259dSTejun Heo dirty_list = &wb->b_dirty_time;
2535d6c10f1fSTejun Heo
2536c7f54084SDave Chinner wakeup_bdi = inode_io_list_move_locked(inode, wb,
2537d6c10f1fSTejun Heo dirty_list);
2538d6c10f1fSTejun Heo
25390747259dSTejun Heo spin_unlock(&wb->list_lock);
254010e14073SJchao Sun spin_unlock(&inode->i_lock);
25410ae45f63STheodore Ts'o trace_writeback_dirty_inode_enqueue(inode);
2542253c34e9SArtem Bityutskiy
2543d6c10f1fSTejun Heo /*
2544d6c10f1fSTejun Heo * If this is the first dirty inode for this bdi,
2545d6c10f1fSTejun Heo * we have to wake up the corresponding bdi thread
2546d6c10f1fSTejun Heo * to make sure background write-back happens
2547d6c10f1fSTejun Heo * later.
2548d6c10f1fSTejun Heo */
2549f56753acSChristoph Hellwig if (wakeup_bdi &&
2550f56753acSChristoph Hellwig (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
25510747259dSTejun Heo wb_wakeup_delayed(wb);
2552a66979abSDave Chinner return;
2553a66979abSDave Chinner }
2554a66979abSDave Chinner }
255510e14073SJchao Sun out_unlock:
255610e14073SJchao Sun if (wb)
255710e14073SJchao Sun spin_unlock(&wb->list_lock);
2558a66979abSDave Chinner spin_unlock(&inode->i_lock);
255903ba3782SJens Axboe }
256003ba3782SJens Axboe EXPORT_SYMBOL(__mark_inode_dirty);
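
/*
 * Illustrative sketch (the wrappers live in <linux/fs.h>, not in this file):
 * filesystems normally reach __mark_inode_dirty() through inline helpers
 * rather than calling it directly:
 *
 *	mark_inode_dirty(inode)		- __mark_inode_dirty(inode, I_DIRTY)
 *	mark_inode_dirty_sync(inode)	- __mark_inode_dirty(inode, I_DIRTY_SYNC)
 *
 * Page-dirtying paths pass I_DIRTY_PAGES, and lazytime timestamp updates
 * pass I_DIRTY_TIME instead.
 */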
256103ba3782SJens Axboe
2562e97fedb9SDave Chinner /*
2563e97fedb9SDave Chinner * The @s_sync_lock is used to serialise concurrent sync operations
2564e97fedb9SDave Chinner * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
2565e97fedb9SDave Chinner * Concurrent callers will block on the s_sync_lock rather than doing contending
2566e97fedb9SDave Chinner * walks. The queueing maintains sync(2) required behaviour as all the IO that
2567e97fedb9SDave Chinner * has been issued up to the time this function is entered is guaranteed to be
2568e97fedb9SDave Chinner * completed by the time we have gained the lock and waited for all IO that is
2569e97fedb9SDave Chinner * in progress regardless of the order callers are granted the lock.
2570e97fedb9SDave Chinner */
2571b6e51316SJens Axboe static void wait_sb_inodes(struct super_block *sb)
257266f3b8e2SJens Axboe {
25736c60d2b5SDave Chinner LIST_HEAD(sync_list);
257438f21977SNick Piggin
257503ba3782SJens Axboe /*
257603ba3782SJens Axboe * We need to be protected against the filesystem going from
257703ba3782SJens Axboe * r/o to r/w or vice versa.
257803ba3782SJens Axboe */
2579b6e51316SJens Axboe WARN_ON(!rwsem_is_locked(&sb->s_umount));
258003ba3782SJens Axboe
2581e97fedb9SDave Chinner mutex_lock(&sb->s_sync_lock);
258266f3b8e2SJens Axboe
258338f21977SNick Piggin /*
25846c60d2b5SDave Chinner * Splice the writeback list onto a temporary list to avoid waiting on
25856c60d2b5SDave Chinner * inodes that have started writeback after this point.
25866c60d2b5SDave Chinner *
25876c60d2b5SDave Chinner * Use rcu_read_lock() to keep the inodes around until we have a
25886c60d2b5SDave Chinner * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
25896c60d2b5SDave Chinner * the local list because inodes can be dropped from either by writeback
25906c60d2b5SDave Chinner * completion.
259138f21977SNick Piggin */
25926c60d2b5SDave Chinner rcu_read_lock();
25936c60d2b5SDave Chinner spin_lock_irq(&sb->s_inode_wblist_lock);
25946c60d2b5SDave Chinner list_splice_init(&sb->s_inodes_wb, &sync_list);
25956c60d2b5SDave Chinner
25966c60d2b5SDave Chinner /*
25976c60d2b5SDave Chinner * Data integrity sync. Must wait for all pages under writeback, because
25986c60d2b5SDave Chinner * there may have been pages dirtied before our sync call whose writeout
25996c60d2b5SDave Chinner * had already started before we got to them. In that case, the inode
26006c60d2b5SDave Chinner * may not be on the dirty list, but we still have to wait for that
26016c60d2b5SDave Chinner * writeout.
26026c60d2b5SDave Chinner */
26036c60d2b5SDave Chinner while (!list_empty(&sync_list)) {
26046c60d2b5SDave Chinner struct inode *inode = list_first_entry(&sync_list, struct inode,
26056c60d2b5SDave Chinner i_wb_list);
2606250df6edSDave Chinner struct address_space *mapping = inode->i_mapping;
260738f21977SNick Piggin
26086c60d2b5SDave Chinner /*
26096c60d2b5SDave Chinner * Move each inode back to the wb list before we drop the lock
26106c60d2b5SDave Chinner * to preserve consistency between i_wb_list and the mapping
26116c60d2b5SDave Chinner * writeback tag. Writeback completion is responsible to remove
26126c60d2b5SDave Chinner * the inode from either list once the writeback tag is cleared.
26136c60d2b5SDave Chinner */
26146c60d2b5SDave Chinner list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
26156c60d2b5SDave Chinner
26166c60d2b5SDave Chinner /*
26176c60d2b5SDave Chinner * The mapping can appear untagged while still on-list since we
26186c60d2b5SDave Chinner * do not have the mapping lock. Skip it here, wb completion
26196c60d2b5SDave Chinner * will remove it.
26206c60d2b5SDave Chinner */
26216c60d2b5SDave Chinner if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
26226c60d2b5SDave Chinner continue;
26236c60d2b5SDave Chinner
26246c60d2b5SDave Chinner spin_unlock_irq(&sb->s_inode_wblist_lock);
26256c60d2b5SDave Chinner
2626250df6edSDave Chinner spin_lock(&inode->i_lock);
26276c60d2b5SDave Chinner if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2628250df6edSDave Chinner spin_unlock(&inode->i_lock);
26296c60d2b5SDave Chinner
26306c60d2b5SDave Chinner spin_lock_irq(&sb->s_inode_wblist_lock);
263138f21977SNick Piggin continue;
2632250df6edSDave Chinner }
263338f21977SNick Piggin __iget(inode);
2634250df6edSDave Chinner spin_unlock(&inode->i_lock);
26356c60d2b5SDave Chinner rcu_read_unlock();
263638f21977SNick Piggin
2637aa750fd7SJunichi Nomura /*
2638aa750fd7SJunichi Nomura * We keep the error status of individual mapping so that
2639aa750fd7SJunichi Nomura * applications can catch the writeback error using fsync(2).
2640aa750fd7SJunichi Nomura * See filemap_fdatawait_keep_errors() for details.
2641aa750fd7SJunichi Nomura */
2642aa750fd7SJunichi Nomura filemap_fdatawait_keep_errors(mapping);
264338f21977SNick Piggin
264438f21977SNick Piggin cond_resched();
264538f21977SNick Piggin
26466c60d2b5SDave Chinner iput(inode);
26476c60d2b5SDave Chinner
26486c60d2b5SDave Chinner rcu_read_lock();
26496c60d2b5SDave Chinner spin_lock_irq(&sb->s_inode_wblist_lock);
265038f21977SNick Piggin }
26516c60d2b5SDave Chinner spin_unlock_irq(&sb->s_inode_wblist_lock);
26526c60d2b5SDave Chinner rcu_read_unlock();
2653e97fedb9SDave Chinner mutex_unlock(&sb->s_sync_lock);
265466f3b8e2SJens Axboe }
26551da177e4SLinus Torvalds
2656f30a7d0cSTejun Heo static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2657f30a7d0cSTejun Heo enum wb_reason reason, bool skip_if_busy)
26581da177e4SLinus Torvalds {
26595b9cce4cSTejun Heo struct backing_dev_info *bdi = sb->s_bdi;
26605b9cce4cSTejun Heo DEFINE_WB_COMPLETION(done, bdi);
266183ba7b07SChristoph Hellwig struct wb_writeback_work work = {
26623c4d7165SChristoph Hellwig .sb = sb,
26633c4d7165SChristoph Hellwig .sync_mode = WB_SYNC_NONE,
26646e6938b6SWu Fengguang .tagged_writepages = 1,
266583ba7b07SChristoph Hellwig .done = &done,
26663259f8beSChris Mason .nr_pages = nr,
26670e175a18SCurt Wohlgemuth .reason = reason,
26683c4d7165SChristoph Hellwig };
26690e3c9a22SJens Axboe
2670e7972912STejun Heo if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
26716eedc701SJan Kara return;
2672cf37e972SChristoph Hellwig WARN_ON(!rwsem_is_locked(&sb->s_umount));
2673f30a7d0cSTejun Heo
2674db125360STejun Heo bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
26755b9cce4cSTejun Heo wb_wait_for_completion(&done);
26761da177e4SLinus Torvalds }
2677f30a7d0cSTejun Heo
2678f30a7d0cSTejun Heo /**
2679f30a7d0cSTejun Heo * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
2680f30a7d0cSTejun Heo * @sb: the superblock
2681f30a7d0cSTejun Heo * @nr: the number of pages to write
2682f30a7d0cSTejun Heo * @reason: reason why some writeback work was initiated
2683f30a7d0cSTejun Heo *
2684f30a7d0cSTejun Heo * Start writeback on some inodes on this super_block. No guarantees are made
2685f30a7d0cSTejun Heo * on how many (if any) will be written, and this function does not wait
2686f30a7d0cSTejun Heo * for IO completion of submitted IO.
2687f30a7d0cSTejun Heo */
2688f30a7d0cSTejun Heo void writeback_inodes_sb_nr(struct super_block *sb,
2689f30a7d0cSTejun Heo unsigned long nr,
2690f30a7d0cSTejun Heo enum wb_reason reason)
2691f30a7d0cSTejun Heo {
2692f30a7d0cSTejun Heo __writeback_inodes_sb_nr(sb, nr, reason, false);
2693f30a7d0cSTejun Heo }
26943259f8beSChris Mason EXPORT_SYMBOL(writeback_inodes_sb_nr);
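
/*
 * Illustrative sketch (hypothetical caller): a filesystem running low on
 * free space might kick asynchronous writeback of a chosen number of pages
 * (nr_dirty stands for whatever count the caller picks) with:
 *
 *	writeback_inodes_sb_nr(sb, nr_dirty, WB_REASON_FS_FREE_SPACE);
 *
 * s_umount must be held; the submitted IO is not waited for.
 */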
26953259f8beSChris Mason
26963259f8beSChris Mason /**
26973259f8beSChris Mason * writeback_inodes_sb - writeback dirty inodes from given super_block
26983259f8beSChris Mason * @sb: the superblock
2699786228abSMarcos Paulo de Souza * @reason: reason why some writeback work was initiated
27003259f8beSChris Mason *
27013259f8beSChris Mason * Start writeback on some inodes on this super_block. No guarantees are made
27023259f8beSChris Mason * on how many (if any) will be written, and this function does not wait
27033259f8beSChris Mason * for IO completion of submitted IO.
27043259f8beSChris Mason */
27050e175a18SCurt Wohlgemuth void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
27063259f8beSChris Mason {
27070e175a18SCurt Wohlgemuth return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
27083259f8beSChris Mason }
2709d8a8559cSJens Axboe EXPORT_SYMBOL(writeback_inodes_sb);
2710d8a8559cSJens Axboe
2711d8a8559cSJens Axboe /**
271210ee27a0SMiao Xie * try_to_writeback_inodes_sb - try to start writeback if none underway
271310ee27a0SMiao Xie * @sb: the superblock
271410ee27a0SMiao Xie * @reason: reason why some writeback work was initiated
271510ee27a0SMiao Xie *
27168264c321SRakesh Pandit * Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
271710ee27a0SMiao Xie */
27188264c321SRakesh Pandit void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
271910ee27a0SMiao Xie {
27208264c321SRakesh Pandit if (!down_read_trylock(&sb->s_umount))
27218264c321SRakesh Pandit return;
27228264c321SRakesh Pandit
27238264c321SRakesh Pandit __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
27248264c321SRakesh Pandit up_read(&sb->s_umount);
272510ee27a0SMiao Xie }
272610ee27a0SMiao Xie EXPORT_SYMBOL(try_to_writeback_inodes_sb);
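
/*
 * Illustrative sketch (hypothetical caller): code that may race with umount
 * uses the try_ variant, which quietly does nothing when s_umount cannot be
 * taken for read:
 *
 *	try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
 */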
27273259f8beSChris Mason
27283259f8beSChris Mason /**
2729d8a8559cSJens Axboe * sync_inodes_sb - sync sb inode pages
2730d8a8559cSJens Axboe * @sb: the superblock
2731d8a8559cSJens Axboe *
2732d8a8559cSJens Axboe * This function writes and waits on any dirty inode belonging to this
27330dc83bd3SJan Kara * super_block.
2734d8a8559cSJens Axboe */
27350dc83bd3SJan Kara void sync_inodes_sb(struct super_block *sb)
2736d8a8559cSJens Axboe {
27375b9cce4cSTejun Heo struct backing_dev_info *bdi = sb->s_bdi;
27385b9cce4cSTejun Heo DEFINE_WB_COMPLETION(done, bdi);
273983ba7b07SChristoph Hellwig struct wb_writeback_work work = {
27403c4d7165SChristoph Hellwig .sb = sb,
27413c4d7165SChristoph Hellwig .sync_mode = WB_SYNC_ALL,
27423c4d7165SChristoph Hellwig .nr_pages = LONG_MAX,
27433c4d7165SChristoph Hellwig .range_cyclic = 0,
274483ba7b07SChristoph Hellwig .done = &done,
27450e175a18SCurt Wohlgemuth .reason = WB_REASON_SYNC,
27467747bd4bSDave Chinner .for_sync = 1,
27473c4d7165SChristoph Hellwig };
27483c4d7165SChristoph Hellwig
2749006a0973STejun Heo /*
2750006a0973STejun Heo * Can't skip on !bdi_has_dirty() because we should wait for !dirty
2751006a0973STejun Heo * inodes under writeback and I_DIRTY_TIME inodes ignored by
2752006a0973STejun Heo * bdi_has_dirty() need to be written out too.
2753006a0973STejun Heo */
2754006a0973STejun Heo if (bdi == &noop_backing_dev_info)
27556eedc701SJan Kara return;
2756cf37e972SChristoph Hellwig WARN_ON(!rwsem_is_locked(&sb->s_umount));
2757cf37e972SChristoph Hellwig
27587fc5854fSTejun Heo /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
27597fc5854fSTejun Heo bdi_down_write_wb_switch_rwsem(bdi);
2760db125360STejun Heo bdi_split_work_to_wbs(bdi, &work, false);
27615b9cce4cSTejun Heo wb_wait_for_completion(&done);
27627fc5854fSTejun Heo bdi_up_write_wb_switch_rwsem(bdi);
276383ba7b07SChristoph Hellwig
2764b6e51316SJens Axboe wait_sb_inodes(sb);
2765d8a8559cSJens Axboe }
2766d8a8559cSJens Axboe EXPORT_SYMBOL(sync_inodes_sb);
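
/*
 * Illustrative sketch (assumed call chain): sync(2) and syncfs(2) reach this
 * function via sync_filesystem(), roughly:
 *
 *	sync_inodes_sb(sb);
 *	if (sb->s_op->sync_fs)
 *		sb->s_op->sync_fs(sb, 1);
 *
 * i.e. data pages are written and waited on here before the filesystem gets
 * to flush its own metadata.
 */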
27671da177e4SLinus Torvalds
27681da177e4SLinus Torvalds /**
27691da177e4SLinus Torvalds * write_inode_now - write an inode to disk
27701da177e4SLinus Torvalds * @inode: inode to write to disk
27711da177e4SLinus Torvalds * @sync: whether the write should be synchronous or not
27721da177e4SLinus Torvalds *
27737f04c26dSAndrea Arcangeli * This function commits an inode to disk immediately if it is dirty. This is
27747f04c26dSAndrea Arcangeli * primarily needed by knfsd.
27757f04c26dSAndrea Arcangeli *
27767f04c26dSAndrea Arcangeli * The caller must either have a ref on the inode or must have set I_WILL_FREE.
27771da177e4SLinus Torvalds */
27781da177e4SLinus Torvalds int write_inode_now(struct inode *inode, int sync)
27791da177e4SLinus Torvalds {
27801da177e4SLinus Torvalds struct writeback_control wbc = {
27811da177e4SLinus Torvalds .nr_to_write = LONG_MAX,
278218914b18SMike Galbraith .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2783111ebb6eSOGAWA Hirofumi .range_start = 0,
2784111ebb6eSOGAWA Hirofumi .range_end = LLONG_MAX,
27851da177e4SLinus Torvalds };
27861da177e4SLinus Torvalds
2787f56753acSChristoph Hellwig if (!mapping_can_writeback(inode->i_mapping))
278849364ce2SAndrew Morton wbc.nr_to_write = 0;
27891da177e4SLinus Torvalds
27901da177e4SLinus Torvalds might_sleep();
2791aaf25593STejun Heo return writeback_single_inode(inode, &wbc);
27921da177e4SLinus Torvalds }
27931da177e4SLinus Torvalds EXPORT_SYMBOL(write_inode_now);
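
/*
 * Illustrative sketch (hypothetical caller): flushing one inode
 * synchronously, e.g. before letting go of the last reference:
 *
 *	int err = write_inode_now(inode, 1);
 *
 * Passing sync == 0 starts WB_SYNC_NONE writeback without waiting for it to
 * complete.
 */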
27941da177e4SLinus Torvalds
27951da177e4SLinus Torvalds /**
2796c691b9d9SAndrew Morton * sync_inode_metadata - write an inode to disk
2797c3765016SChristoph Hellwig * @inode: the inode to sync
2798c3765016SChristoph Hellwig * @wait: wait for I/O to complete.
2799c3765016SChristoph Hellwig *
2800c691b9d9SAndrew Morton * Write an inode to disk and adjust its dirty state after completion.
2801c3765016SChristoph Hellwig *
2802c3765016SChristoph Hellwig * Note: only writes the actual inode, no associated data or other metadata.
2803c3765016SChristoph Hellwig */
2804c3765016SChristoph Hellwig int sync_inode_metadata(struct inode *inode, int wait)
2805c3765016SChristoph Hellwig {
2806c3765016SChristoph Hellwig struct writeback_control wbc = {
2807c3765016SChristoph Hellwig .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2808c3765016SChristoph Hellwig .nr_to_write = 0, /* metadata-only */
2809c3765016SChristoph Hellwig };
2810c3765016SChristoph Hellwig
28115662c967SJosef Bacik return writeback_single_inode(inode, &wbc);
2812c3765016SChristoph Hellwig }
2813c3765016SChristoph Hellwig EXPORT_SYMBOL(sync_inode_metadata);
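
/*
 * Illustrative sketch (hypothetical fsync path): once the data pages have
 * been written, the inode itself can be persisted with:
 *
 *	int err = sync_inode_metadata(inode, 1);
 *
 * As the comment above notes, no data or other metadata is written here.
 */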
2814