1 /* 2 * fs/fs-writeback.c 3 * 4 * Copyright (C) 2002, Linus Torvalds. 5 * 6 * Contains all the functions related to writing back and waiting 7 * upon dirty inodes against superblocks, and writing back dirty 8 * pages against inodes. ie: data writeback. Writeout of the 9 * inode itself is not handled here. 10 * 11 * 10Apr2002 Andrew Morton 12 * Split out of fs/inode.c 13 * Additions for address_space-based writeback 14 */ 15 16 #include <linux/kernel.h> 17 #include <linux/export.h> 18 #include <linux/spinlock.h> 19 #include <linux/slab.h> 20 #include <linux/sched.h> 21 #include <linux/fs.h> 22 #include <linux/mm.h> 23 #include <linux/pagemap.h> 24 #include <linux/kthread.h> 25 #include <linux/writeback.h> 26 #include <linux/blkdev.h> 27 #include <linux/backing-dev.h> 28 #include <linux/tracepoint.h> 29 #include <linux/device.h> 30 #include <linux/memcontrol.h> 31 #include "internal.h" 32 33 /* 34 * 4MB minimal write chunk size 35 */ 36 #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) 37 38 struct wb_completion { 39 atomic_t cnt; 40 }; 41 42 /* 43 * Passed into wb_writeback(), essentially a subset of writeback_control 44 */ 45 struct wb_writeback_work { 46 long nr_pages; 47 struct super_block *sb; 48 unsigned long *older_than_this; 49 enum writeback_sync_modes sync_mode; 50 unsigned int tagged_writepages:1; 51 unsigned int for_kupdate:1; 52 unsigned int range_cyclic:1; 53 unsigned int for_background:1; 54 unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ 55 unsigned int auto_free:1; /* free on completion */ 56 enum wb_reason reason; /* why was writeback initiated? */ 57 58 struct list_head list; /* pending work list */ 59 struct wb_completion *done; /* set if the caller waits */ 60 }; 61 62 /* 63 * If one wants to wait for one or more wb_writeback_works, each work's 64 * ->done should be set to a wb_completion defined using the following 65 * macro. Once all work items are issued with wb_queue_work(), the caller 66 * can wait for the completion of all using wb_wait_for_completion(). Work 67 * items which are waited upon aren't freed automatically on completion. 68 */ 69 #define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \ 70 struct wb_completion cmpl = { \ 71 .cnt = ATOMIC_INIT(1), \ 72 } 73 74 75 /* 76 * If an inode is constantly having its pages dirtied, but then the 77 * updates stop dirtytime_expire_interval seconds in the past, it's 78 * possible for the worst case time between when an inode has its 79 * timestamps updated and when they finally get written out to be two 80 * dirtytime_expire_intervals. We set the default to 12 hours (in 81 * seconds), which means most of the time inodes will have their 82 * timestamps written to disk after 12 hours, but in the worst case a 83 * few inodes might not their timestamps updated for 24 hours. 84 */ 85 unsigned int dirtytime_expire_interval = 12 * 60 * 60; 86 87 static inline struct inode *wb_inode(struct list_head *head) 88 { 89 return list_entry(head, struct inode, i_io_list); 90 } 91 92 /* 93 * Include the creation of the trace points after defining the 94 * wb_writeback_work structure and inline functions so that the definition 95 * remains local to this file. 96 */ 97 #define CREATE_TRACE_POINTS 98 #include <trace/events/writeback.h> 99 100 EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); 101 102 static bool wb_io_lists_populated(struct bdi_writeback *wb) 103 { 104 if (wb_has_dirty_io(wb)) { 105 return false; 106 } else { 107 set_bit(WB_has_dirty_io, &wb->state); 108 WARN_ON_ONCE(!wb->avg_write_bandwidth); 109 atomic_long_add(wb->avg_write_bandwidth, 110 &wb->bdi->tot_write_bandwidth); 111 return true; 112 } 113 } 114 115 static void wb_io_lists_depopulated(struct bdi_writeback *wb) 116 { 117 if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && 118 list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { 119 clear_bit(WB_has_dirty_io, &wb->state); 120 WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, 121 &wb->bdi->tot_write_bandwidth) < 0); 122 } 123 } 124 125 /** 126 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list 127 * @inode: inode to be moved 128 * @wb: target bdi_writeback 129 * @head: one of @wb->b_{dirty|io|more_io} 130 * 131 * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. 132 * Returns %true if @inode is the first occupant of the !dirty_time IO 133 * lists; otherwise, %false. 134 */ 135 static bool inode_io_list_move_locked(struct inode *inode, 136 struct bdi_writeback *wb, 137 struct list_head *head) 138 { 139 assert_spin_locked(&wb->list_lock); 140 141 list_move(&inode->i_io_list, head); 142 143 /* dirty_time doesn't count as dirty_io until expiration */ 144 if (head != &wb->b_dirty_time) 145 return wb_io_lists_populated(wb); 146 147 wb_io_lists_depopulated(wb); 148 return false; 149 } 150 151 /** 152 * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list 153 * @inode: inode to be removed 154 * @wb: bdi_writeback @inode is being removed from 155 * 156 * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and 157 * clear %WB_has_dirty_io if all are empty afterwards. 158 */ 159 static void inode_io_list_del_locked(struct inode *inode, 160 struct bdi_writeback *wb) 161 { 162 assert_spin_locked(&wb->list_lock); 163 164 list_del_init(&inode->i_io_list); 165 wb_io_lists_depopulated(wb); 166 } 167 168 static void wb_wakeup(struct bdi_writeback *wb) 169 { 170 spin_lock_bh(&wb->work_lock); 171 if (test_bit(WB_registered, &wb->state)) 172 mod_delayed_work(bdi_wq, &wb->dwork, 0); 173 spin_unlock_bh(&wb->work_lock); 174 } 175 176 static void wb_queue_work(struct bdi_writeback *wb, 177 struct wb_writeback_work *work) 178 { 179 trace_writeback_queue(wb, work); 180 181 spin_lock_bh(&wb->work_lock); 182 if (!test_bit(WB_registered, &wb->state)) 183 goto out_unlock; 184 if (work->done) 185 atomic_inc(&work->done->cnt); 186 list_add_tail(&work->list, &wb->work_list); 187 mod_delayed_work(bdi_wq, &wb->dwork, 0); 188 out_unlock: 189 spin_unlock_bh(&wb->work_lock); 190 } 191 192 /** 193 * wb_wait_for_completion - wait for completion of bdi_writeback_works 194 * @bdi: bdi work items were issued to 195 * @done: target wb_completion 196 * 197 * Wait for one or more work items issued to @bdi with their ->done field 198 * set to @done, which should have been defined with 199 * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such 200 * work items are completed. Work items which are waited upon aren't freed 201 * automatically on completion. 202 */ 203 static void wb_wait_for_completion(struct backing_dev_info *bdi, 204 struct wb_completion *done) 205 { 206 atomic_dec(&done->cnt); /* put down the initial count */ 207 wait_event(bdi->wb_waitq, !atomic_read(&done->cnt)); 208 } 209 210 #ifdef CONFIG_CGROUP_WRITEBACK 211 212 /* parameters for foreign inode detection, see wb_detach_inode() */ 213 #define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ 214 #define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ 215 #define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ 216 #define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ 217 218 #define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ 219 #define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) 220 /* each slot's duration is 2s / 16 */ 221 #define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) 222 /* if foreign slots >= 8, switch */ 223 #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) 224 /* one round can affect upto 5 slots */ 225 226 void __inode_attach_wb(struct inode *inode, struct page *page) 227 { 228 struct backing_dev_info *bdi = inode_to_bdi(inode); 229 struct bdi_writeback *wb = NULL; 230 231 if (inode_cgwb_enabled(inode)) { 232 struct cgroup_subsys_state *memcg_css; 233 234 if (page) { 235 memcg_css = mem_cgroup_css_from_page(page); 236 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); 237 } else { 238 /* must pin memcg_css, see wb_get_create() */ 239 memcg_css = task_get_css(current, memory_cgrp_id); 240 wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); 241 css_put(memcg_css); 242 } 243 } 244 245 if (!wb) 246 wb = &bdi->wb; 247 248 /* 249 * There may be multiple instances of this function racing to 250 * update the same inode. Use cmpxchg() to tell the winner. 251 */ 252 if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) 253 wb_put(wb); 254 } 255 256 /** 257 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it 258 * @inode: inode of interest with i_lock held 259 * 260 * Returns @inode's wb with its list_lock held. @inode->i_lock must be 261 * held on entry and is released on return. The returned wb is guaranteed 262 * to stay @inode's associated wb until its list_lock is released. 263 */ 264 static struct bdi_writeback * 265 locked_inode_to_wb_and_lock_list(struct inode *inode) 266 __releases(&inode->i_lock) 267 __acquires(&wb->list_lock) 268 { 269 while (true) { 270 struct bdi_writeback *wb = inode_to_wb(inode); 271 272 /* 273 * inode_to_wb() association is protected by both 274 * @inode->i_lock and @wb->list_lock but list_lock nests 275 * outside i_lock. Drop i_lock and verify that the 276 * association hasn't changed after acquiring list_lock. 277 */ 278 wb_get(wb); 279 spin_unlock(&inode->i_lock); 280 spin_lock(&wb->list_lock); 281 wb_put(wb); /* not gonna deref it anymore */ 282 283 /* i_wb may have changed inbetween, can't use inode_to_wb() */ 284 if (likely(wb == inode->i_wb)) 285 return wb; /* @inode already has ref */ 286 287 spin_unlock(&wb->list_lock); 288 cpu_relax(); 289 spin_lock(&inode->i_lock); 290 } 291 } 292 293 /** 294 * inode_to_wb_and_lock_list - determine an inode's wb and lock it 295 * @inode: inode of interest 296 * 297 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held 298 * on entry. 299 */ 300 static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) 301 __acquires(&wb->list_lock) 302 { 303 spin_lock(&inode->i_lock); 304 return locked_inode_to_wb_and_lock_list(inode); 305 } 306 307 struct inode_switch_wbs_context { 308 struct inode *inode; 309 struct bdi_writeback *new_wb; 310 311 struct rcu_head rcu_head; 312 struct work_struct work; 313 }; 314 315 static void inode_switch_wbs_work_fn(struct work_struct *work) 316 { 317 struct inode_switch_wbs_context *isw = 318 container_of(work, struct inode_switch_wbs_context, work); 319 struct inode *inode = isw->inode; 320 struct super_block *sb = inode->i_sb; 321 struct address_space *mapping = inode->i_mapping; 322 struct bdi_writeback *old_wb = inode->i_wb; 323 struct bdi_writeback *new_wb = isw->new_wb; 324 struct radix_tree_iter iter; 325 bool switched = false; 326 void **slot; 327 328 /* 329 * By the time control reaches here, RCU grace period has passed 330 * since I_WB_SWITCH assertion and all wb stat update transactions 331 * between unlocked_inode_to_wb_begin/end() are guaranteed to be 332 * synchronizing against mapping->tree_lock. 333 * 334 * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock 335 * gives us exclusion against all wb related operations on @inode 336 * including IO list manipulations and stat updates. 337 */ 338 if (old_wb < new_wb) { 339 spin_lock(&old_wb->list_lock); 340 spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); 341 } else { 342 spin_lock(&new_wb->list_lock); 343 spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); 344 } 345 spin_lock(&inode->i_lock); 346 spin_lock_irq(&mapping->tree_lock); 347 348 /* 349 * Once I_FREEING is visible under i_lock, the eviction path owns 350 * the inode and we shouldn't modify ->i_io_list. 351 */ 352 if (unlikely(inode->i_state & I_FREEING)) 353 goto skip_switch; 354 355 /* 356 * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points 357 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to 358 * pages actually under underwriteback. 359 */ 360 radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, 361 PAGECACHE_TAG_DIRTY) { 362 struct page *page = radix_tree_deref_slot_protected(slot, 363 &mapping->tree_lock); 364 if (likely(page) && PageDirty(page)) { 365 __dec_wb_stat(old_wb, WB_RECLAIMABLE); 366 __inc_wb_stat(new_wb, WB_RECLAIMABLE); 367 } 368 } 369 370 radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, 371 PAGECACHE_TAG_WRITEBACK) { 372 struct page *page = radix_tree_deref_slot_protected(slot, 373 &mapping->tree_lock); 374 if (likely(page)) { 375 WARN_ON_ONCE(!PageWriteback(page)); 376 __dec_wb_stat(old_wb, WB_WRITEBACK); 377 __inc_wb_stat(new_wb, WB_WRITEBACK); 378 } 379 } 380 381 wb_get(new_wb); 382 383 /* 384 * Transfer to @new_wb's IO list if necessary. The specific list 385 * @inode was on is ignored and the inode is put on ->b_dirty which 386 * is always correct including from ->b_dirty_time. The transfer 387 * preserves @inode->dirtied_when ordering. 388 */ 389 if (!list_empty(&inode->i_io_list)) { 390 struct inode *pos; 391 392 inode_io_list_del_locked(inode, old_wb); 393 inode->i_wb = new_wb; 394 list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) 395 if (time_after_eq(inode->dirtied_when, 396 pos->dirtied_when)) 397 break; 398 inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); 399 } else { 400 inode->i_wb = new_wb; 401 } 402 403 /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */ 404 inode->i_wb_frn_winner = 0; 405 inode->i_wb_frn_avg_time = 0; 406 inode->i_wb_frn_history = 0; 407 switched = true; 408 skip_switch: 409 /* 410 * Paired with load_acquire in unlocked_inode_to_wb_begin() and 411 * ensures that the new wb is visible if they see !I_WB_SWITCH. 412 */ 413 smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); 414 415 spin_unlock_irq(&mapping->tree_lock); 416 spin_unlock(&inode->i_lock); 417 spin_unlock(&new_wb->list_lock); 418 spin_unlock(&old_wb->list_lock); 419 420 if (switched) { 421 wb_wakeup(new_wb); 422 wb_put(old_wb); 423 } 424 wb_put(new_wb); 425 426 iput(inode); 427 deactivate_super(sb); 428 kfree(isw); 429 } 430 431 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) 432 { 433 struct inode_switch_wbs_context *isw = container_of(rcu_head, 434 struct inode_switch_wbs_context, rcu_head); 435 436 /* needs to grab bh-unsafe locks, bounce to work item */ 437 INIT_WORK(&isw->work, inode_switch_wbs_work_fn); 438 schedule_work(&isw->work); 439 } 440 441 /** 442 * inode_switch_wbs - change the wb association of an inode 443 * @inode: target inode 444 * @new_wb_id: ID of the new wb 445 * 446 * Switch @inode's wb association to the wb identified by @new_wb_id. The 447 * switching is performed asynchronously and may fail silently. 448 */ 449 static void inode_switch_wbs(struct inode *inode, int new_wb_id) 450 { 451 struct backing_dev_info *bdi = inode_to_bdi(inode); 452 struct cgroup_subsys_state *memcg_css; 453 struct inode_switch_wbs_context *isw; 454 455 /* noop if seems to be already in progress */ 456 if (inode->i_state & I_WB_SWITCH) 457 return; 458 459 isw = kzalloc(sizeof(*isw), GFP_ATOMIC); 460 if (!isw) 461 return; 462 463 /* find and pin the new wb */ 464 rcu_read_lock(); 465 memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); 466 if (memcg_css) 467 isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); 468 rcu_read_unlock(); 469 if (!isw->new_wb) 470 goto out_free; 471 472 /* while holding I_WB_SWITCH, no one else can update the association */ 473 spin_lock(&inode->i_lock); 474 475 if (inode->i_state & (I_WB_SWITCH | I_FREEING) || 476 inode_to_wb(inode) == isw->new_wb) 477 goto out_unlock; 478 479 if (!atomic_inc_not_zero(&inode->i_sb->s_active)) 480 goto out_unlock; 481 482 inode->i_state |= I_WB_SWITCH; 483 spin_unlock(&inode->i_lock); 484 485 ihold(inode); 486 isw->inode = inode; 487 488 /* 489 * In addition to synchronizing among switchers, I_WB_SWITCH tells 490 * the RCU protected stat update paths to grab the mapping's 491 * tree_lock so that stat transfer can synchronize against them. 492 * Let's continue after I_WB_SWITCH is guaranteed to be visible. 493 */ 494 call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); 495 return; 496 497 out_unlock: 498 spin_unlock(&inode->i_lock); 499 out_free: 500 if (isw->new_wb) 501 wb_put(isw->new_wb); 502 kfree(isw); 503 } 504 505 /** 506 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it 507 * @wbc: writeback_control of interest 508 * @inode: target inode 509 * 510 * @inode is locked and about to be written back under the control of @wbc. 511 * Record @inode's writeback context into @wbc and unlock the i_lock. On 512 * writeback completion, wbc_detach_inode() should be called. This is used 513 * to track the cgroup writeback context. 514 */ 515 void wbc_attach_and_unlock_inode(struct writeback_control *wbc, 516 struct inode *inode) 517 { 518 if (!inode_cgwb_enabled(inode)) { 519 spin_unlock(&inode->i_lock); 520 return; 521 } 522 523 wbc->wb = inode_to_wb(inode); 524 wbc->inode = inode; 525 526 wbc->wb_id = wbc->wb->memcg_css->id; 527 wbc->wb_lcand_id = inode->i_wb_frn_winner; 528 wbc->wb_tcand_id = 0; 529 wbc->wb_bytes = 0; 530 wbc->wb_lcand_bytes = 0; 531 wbc->wb_tcand_bytes = 0; 532 533 wb_get(wbc->wb); 534 spin_unlock(&inode->i_lock); 535 536 /* 537 * A dying wb indicates that the memcg-blkcg mapping has changed 538 * and a new wb is already serving the memcg. Switch immediately. 539 */ 540 if (unlikely(wb_dying(wbc->wb))) 541 inode_switch_wbs(inode, wbc->wb_id); 542 } 543 544 /** 545 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection 546 * @wbc: writeback_control of the just finished writeback 547 * 548 * To be called after a writeback attempt of an inode finishes and undoes 549 * wbc_attach_and_unlock_inode(). Can be called under any context. 550 * 551 * As concurrent write sharing of an inode is expected to be very rare and 552 * memcg only tracks page ownership on first-use basis severely confining 553 * the usefulness of such sharing, cgroup writeback tracks ownership 554 * per-inode. While the support for concurrent write sharing of an inode 555 * is deemed unnecessary, an inode being written to by different cgroups at 556 * different points in time is a lot more common, and, more importantly, 557 * charging only by first-use can too readily lead to grossly incorrect 558 * behaviors (single foreign page can lead to gigabytes of writeback to be 559 * incorrectly attributed). 560 * 561 * To resolve this issue, cgroup writeback detects the majority dirtier of 562 * an inode and transfers the ownership to it. To avoid unnnecessary 563 * oscillation, the detection mechanism keeps track of history and gives 564 * out the switch verdict only if the foreign usage pattern is stable over 565 * a certain amount of time and/or writeback attempts. 566 * 567 * On each writeback attempt, @wbc tries to detect the majority writer 568 * using Boyer-Moore majority vote algorithm. In addition to the byte 569 * count from the majority voting, it also counts the bytes written for the 570 * current wb and the last round's winner wb (max of last round's current 571 * wb, the winner from two rounds ago, and the last round's majority 572 * candidate). Keeping track of the historical winner helps the algorithm 573 * to semi-reliably detect the most active writer even when it's not the 574 * absolute majority. 575 * 576 * Once the winner of the round is determined, whether the winner is 577 * foreign or not and how much IO time the round consumed is recorded in 578 * inode->i_wb_frn_history. If the amount of recorded foreign IO time is 579 * over a certain threshold, the switch verdict is given. 580 */ 581 void wbc_detach_inode(struct writeback_control *wbc) 582 { 583 struct bdi_writeback *wb = wbc->wb; 584 struct inode *inode = wbc->inode; 585 unsigned long avg_time, max_bytes, max_time; 586 u16 history; 587 int max_id; 588 589 if (!wb) 590 return; 591 592 history = inode->i_wb_frn_history; 593 avg_time = inode->i_wb_frn_avg_time; 594 595 /* pick the winner of this round */ 596 if (wbc->wb_bytes >= wbc->wb_lcand_bytes && 597 wbc->wb_bytes >= wbc->wb_tcand_bytes) { 598 max_id = wbc->wb_id; 599 max_bytes = wbc->wb_bytes; 600 } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { 601 max_id = wbc->wb_lcand_id; 602 max_bytes = wbc->wb_lcand_bytes; 603 } else { 604 max_id = wbc->wb_tcand_id; 605 max_bytes = wbc->wb_tcand_bytes; 606 } 607 608 /* 609 * Calculate the amount of IO time the winner consumed and fold it 610 * into the running average kept per inode. If the consumed IO 611 * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for 612 * deciding whether to switch or not. This is to prevent one-off 613 * small dirtiers from skewing the verdict. 614 */ 615 max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, 616 wb->avg_write_bandwidth); 617 if (avg_time) 618 avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - 619 (avg_time >> WB_FRN_TIME_AVG_SHIFT); 620 else 621 avg_time = max_time; /* immediate catch up on first run */ 622 623 if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { 624 int slots; 625 626 /* 627 * The switch verdict is reached if foreign wb's consume 628 * more than a certain proportion of IO time in a 629 * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot 630 * history mask where each bit represents one sixteenth of 631 * the period. Determine the number of slots to shift into 632 * history from @max_time. 633 */ 634 slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), 635 (unsigned long)WB_FRN_HIST_MAX_SLOTS); 636 history <<= slots; 637 if (wbc->wb_id != max_id) 638 history |= (1U << slots) - 1; 639 640 /* 641 * Switch if the current wb isn't the consistent winner. 642 * If there are multiple closely competing dirtiers, the 643 * inode may switch across them repeatedly over time, which 644 * is okay. The main goal is avoiding keeping an inode on 645 * the wrong wb for an extended period of time. 646 */ 647 if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) 648 inode_switch_wbs(inode, max_id); 649 } 650 651 /* 652 * Multiple instances of this function may race to update the 653 * following fields but we don't mind occassional inaccuracies. 654 */ 655 inode->i_wb_frn_winner = max_id; 656 inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); 657 inode->i_wb_frn_history = history; 658 659 wb_put(wbc->wb); 660 wbc->wb = NULL; 661 } 662 663 /** 664 * wbc_account_io - account IO issued during writeback 665 * @wbc: writeback_control of the writeback in progress 666 * @page: page being written out 667 * @bytes: number of bytes being written out 668 * 669 * @bytes from @page are about to written out during the writeback 670 * controlled by @wbc. Keep the book for foreign inode detection. See 671 * wbc_detach_inode(). 672 */ 673 void wbc_account_io(struct writeback_control *wbc, struct page *page, 674 size_t bytes) 675 { 676 int id; 677 678 /* 679 * pageout() path doesn't attach @wbc to the inode being written 680 * out. This is intentional as we don't want the function to block 681 * behind a slow cgroup. Ultimately, we want pageout() to kick off 682 * regular writeback instead of writing things out itself. 683 */ 684 if (!wbc->wb) 685 return; 686 687 id = mem_cgroup_css_from_page(page)->id; 688 689 if (id == wbc->wb_id) { 690 wbc->wb_bytes += bytes; 691 return; 692 } 693 694 if (id == wbc->wb_lcand_id) 695 wbc->wb_lcand_bytes += bytes; 696 697 /* Boyer-Moore majority vote algorithm */ 698 if (!wbc->wb_tcand_bytes) 699 wbc->wb_tcand_id = id; 700 if (id == wbc->wb_tcand_id) 701 wbc->wb_tcand_bytes += bytes; 702 else 703 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); 704 } 705 EXPORT_SYMBOL_GPL(wbc_account_io); 706 707 /** 708 * inode_congested - test whether an inode is congested 709 * @inode: inode to test for congestion (may be NULL) 710 * @cong_bits: mask of WB_[a]sync_congested bits to test 711 * 712 * Tests whether @inode is congested. @cong_bits is the mask of congestion 713 * bits to test and the return value is the mask of set bits. 714 * 715 * If cgroup writeback is enabled for @inode, the congestion state is 716 * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg 717 * associated with @inode is congested; otherwise, the root wb's congestion 718 * state is used. 719 * 720 * @inode is allowed to be NULL as this function is often called on 721 * mapping->host which is NULL for the swapper space. 722 */ 723 int inode_congested(struct inode *inode, int cong_bits) 724 { 725 /* 726 * Once set, ->i_wb never becomes NULL while the inode is alive. 727 * Start transaction iff ->i_wb is visible. 728 */ 729 if (inode && inode_to_wb_is_valid(inode)) { 730 struct bdi_writeback *wb; 731 bool locked, congested; 732 733 wb = unlocked_inode_to_wb_begin(inode, &locked); 734 congested = wb_congested(wb, cong_bits); 735 unlocked_inode_to_wb_end(inode, locked); 736 return congested; 737 } 738 739 return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); 740 } 741 EXPORT_SYMBOL_GPL(inode_congested); 742 743 /** 744 * wb_split_bdi_pages - split nr_pages to write according to bandwidth 745 * @wb: target bdi_writeback to split @nr_pages to 746 * @nr_pages: number of pages to write for the whole bdi 747 * 748 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in 749 * relation to the total write bandwidth of all wb's w/ dirty inodes on 750 * @wb->bdi. 751 */ 752 static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) 753 { 754 unsigned long this_bw = wb->avg_write_bandwidth; 755 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); 756 757 if (nr_pages == LONG_MAX) 758 return LONG_MAX; 759 760 /* 761 * This may be called on clean wb's and proportional distribution 762 * may not make sense, just use the original @nr_pages in those 763 * cases. In general, we wanna err on the side of writing more. 764 */ 765 if (!tot_bw || this_bw >= tot_bw) 766 return nr_pages; 767 else 768 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); 769 } 770 771 /** 772 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi 773 * @bdi: target backing_dev_info 774 * @base_work: wb_writeback_work to issue 775 * @skip_if_busy: skip wb's which already have writeback in progress 776 * 777 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which 778 * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's 779 * distributed to the busy wbs according to each wb's proportion in the 780 * total active write bandwidth of @bdi. 781 */ 782 static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, 783 struct wb_writeback_work *base_work, 784 bool skip_if_busy) 785 { 786 struct bdi_writeback *last_wb = NULL; 787 struct bdi_writeback *wb = list_entry(&bdi->wb_list, 788 struct bdi_writeback, bdi_node); 789 790 might_sleep(); 791 restart: 792 rcu_read_lock(); 793 list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) { 794 DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done); 795 struct wb_writeback_work fallback_work; 796 struct wb_writeback_work *work; 797 long nr_pages; 798 799 if (last_wb) { 800 wb_put(last_wb); 801 last_wb = NULL; 802 } 803 804 /* SYNC_ALL writes out I_DIRTY_TIME too */ 805 if (!wb_has_dirty_io(wb) && 806 (base_work->sync_mode == WB_SYNC_NONE || 807 list_empty(&wb->b_dirty_time))) 808 continue; 809 if (skip_if_busy && writeback_in_progress(wb)) 810 continue; 811 812 nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); 813 814 work = kmalloc(sizeof(*work), GFP_ATOMIC); 815 if (work) { 816 *work = *base_work; 817 work->nr_pages = nr_pages; 818 work->auto_free = 1; 819 wb_queue_work(wb, work); 820 continue; 821 } 822 823 /* alloc failed, execute synchronously using on-stack fallback */ 824 work = &fallback_work; 825 *work = *base_work; 826 work->nr_pages = nr_pages; 827 work->auto_free = 0; 828 work->done = &fallback_work_done; 829 830 wb_queue_work(wb, work); 831 832 /* 833 * Pin @wb so that it stays on @bdi->wb_list. This allows 834 * continuing iteration from @wb after dropping and 835 * regrabbing rcu read lock. 836 */ 837 wb_get(wb); 838 last_wb = wb; 839 840 rcu_read_unlock(); 841 wb_wait_for_completion(bdi, &fallback_work_done); 842 goto restart; 843 } 844 rcu_read_unlock(); 845 846 if (last_wb) 847 wb_put(last_wb); 848 } 849 850 #else /* CONFIG_CGROUP_WRITEBACK */ 851 852 static struct bdi_writeback * 853 locked_inode_to_wb_and_lock_list(struct inode *inode) 854 __releases(&inode->i_lock) 855 __acquires(&wb->list_lock) 856 { 857 struct bdi_writeback *wb = inode_to_wb(inode); 858 859 spin_unlock(&inode->i_lock); 860 spin_lock(&wb->list_lock); 861 return wb; 862 } 863 864 static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) 865 __acquires(&wb->list_lock) 866 { 867 struct bdi_writeback *wb = inode_to_wb(inode); 868 869 spin_lock(&wb->list_lock); 870 return wb; 871 } 872 873 static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) 874 { 875 return nr_pages; 876 } 877 878 static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, 879 struct wb_writeback_work *base_work, 880 bool skip_if_busy) 881 { 882 might_sleep(); 883 884 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { 885 base_work->auto_free = 0; 886 wb_queue_work(&bdi->wb, base_work); 887 } 888 } 889 890 #endif /* CONFIG_CGROUP_WRITEBACK */ 891 892 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, 893 bool range_cyclic, enum wb_reason reason) 894 { 895 struct wb_writeback_work *work; 896 897 if (!wb_has_dirty_io(wb)) 898 return; 899 900 /* 901 * This is WB_SYNC_NONE writeback, so if allocation fails just 902 * wakeup the thread for old dirty data writeback 903 */ 904 work = kzalloc(sizeof(*work), GFP_ATOMIC); 905 if (!work) { 906 trace_writeback_nowork(wb); 907 wb_wakeup(wb); 908 return; 909 } 910 911 work->sync_mode = WB_SYNC_NONE; 912 work->nr_pages = nr_pages; 913 work->range_cyclic = range_cyclic; 914 work->reason = reason; 915 work->auto_free = 1; 916 917 wb_queue_work(wb, work); 918 } 919 920 /** 921 * wb_start_background_writeback - start background writeback 922 * @wb: bdi_writback to write from 923 * 924 * Description: 925 * This makes sure WB_SYNC_NONE background writeback happens. When 926 * this function returns, it is only guaranteed that for given wb 927 * some IO is happening if we are over background dirty threshold. 928 * Caller need not hold sb s_umount semaphore. 929 */ 930 void wb_start_background_writeback(struct bdi_writeback *wb) 931 { 932 /* 933 * We just wake up the flusher thread. It will perform background 934 * writeback as soon as there is no other work to do. 935 */ 936 trace_writeback_wake_background(wb); 937 wb_wakeup(wb); 938 } 939 940 /* 941 * Remove the inode from the writeback list it is on. 942 */ 943 void inode_io_list_del(struct inode *inode) 944 { 945 struct bdi_writeback *wb; 946 947 wb = inode_to_wb_and_lock_list(inode); 948 inode_io_list_del_locked(inode, wb); 949 spin_unlock(&wb->list_lock); 950 } 951 952 /* 953 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 954 * furthest end of its superblock's dirty-inode list. 955 * 956 * Before stamping the inode's ->dirtied_when, we check to see whether it is 957 * already the most-recently-dirtied inode on the b_dirty list. If that is 958 * the case then the inode must have been redirtied while it was being written 959 * out and we don't reset its dirtied_when. 960 */ 961 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) 962 { 963 if (!list_empty(&wb->b_dirty)) { 964 struct inode *tail; 965 966 tail = wb_inode(wb->b_dirty.next); 967 if (time_before(inode->dirtied_when, tail->dirtied_when)) 968 inode->dirtied_when = jiffies; 969 } 970 inode_io_list_move_locked(inode, wb, &wb->b_dirty); 971 } 972 973 /* 974 * requeue inode for re-scanning after bdi->b_io list is exhausted. 975 */ 976 static void requeue_io(struct inode *inode, struct bdi_writeback *wb) 977 { 978 inode_io_list_move_locked(inode, wb, &wb->b_more_io); 979 } 980 981 static void inode_sync_complete(struct inode *inode) 982 { 983 inode->i_state &= ~I_SYNC; 984 /* If inode is clean an unused, put it into LRU now... */ 985 inode_add_lru(inode); 986 /* Waiters must see I_SYNC cleared before being woken up */ 987 smp_mb(); 988 wake_up_bit(&inode->i_state, __I_SYNC); 989 } 990 991 static bool inode_dirtied_after(struct inode *inode, unsigned long t) 992 { 993 bool ret = time_after(inode->dirtied_when, t); 994 #ifndef CONFIG_64BIT 995 /* 996 * For inodes being constantly redirtied, dirtied_when can get stuck. 997 * It _appears_ to be in the future, but is actually in distant past. 998 * This test is necessary to prevent such wrapped-around relative times 999 * from permanently stopping the whole bdi writeback. 1000 */ 1001 ret = ret && time_before_eq(inode->dirtied_when, jiffies); 1002 #endif 1003 return ret; 1004 } 1005 1006 #define EXPIRE_DIRTY_ATIME 0x0001 1007 1008 /* 1009 * Move expired (dirtied before work->older_than_this) dirty inodes from 1010 * @delaying_queue to @dispatch_queue. 1011 */ 1012 static int move_expired_inodes(struct list_head *delaying_queue, 1013 struct list_head *dispatch_queue, 1014 int flags, 1015 struct wb_writeback_work *work) 1016 { 1017 unsigned long *older_than_this = NULL; 1018 unsigned long expire_time; 1019 LIST_HEAD(tmp); 1020 struct list_head *pos, *node; 1021 struct super_block *sb = NULL; 1022 struct inode *inode; 1023 int do_sb_sort = 0; 1024 int moved = 0; 1025 1026 if ((flags & EXPIRE_DIRTY_ATIME) == 0) 1027 older_than_this = work->older_than_this; 1028 else if (!work->for_sync) { 1029 expire_time = jiffies - (dirtytime_expire_interval * HZ); 1030 older_than_this = &expire_time; 1031 } 1032 while (!list_empty(delaying_queue)) { 1033 inode = wb_inode(delaying_queue->prev); 1034 if (older_than_this && 1035 inode_dirtied_after(inode, *older_than_this)) 1036 break; 1037 list_move(&inode->i_io_list, &tmp); 1038 moved++; 1039 if (flags & EXPIRE_DIRTY_ATIME) 1040 set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); 1041 if (sb_is_blkdev_sb(inode->i_sb)) 1042 continue; 1043 if (sb && sb != inode->i_sb) 1044 do_sb_sort = 1; 1045 sb = inode->i_sb; 1046 } 1047 1048 /* just one sb in list, splice to dispatch_queue and we're done */ 1049 if (!do_sb_sort) { 1050 list_splice(&tmp, dispatch_queue); 1051 goto out; 1052 } 1053 1054 /* Move inodes from one superblock together */ 1055 while (!list_empty(&tmp)) { 1056 sb = wb_inode(tmp.prev)->i_sb; 1057 list_for_each_prev_safe(pos, node, &tmp) { 1058 inode = wb_inode(pos); 1059 if (inode->i_sb == sb) 1060 list_move(&inode->i_io_list, dispatch_queue); 1061 } 1062 } 1063 out: 1064 return moved; 1065 } 1066 1067 /* 1068 * Queue all expired dirty inodes for io, eldest first. 1069 * Before 1070 * newly dirtied b_dirty b_io b_more_io 1071 * =============> gf edc BA 1072 * After 1073 * newly dirtied b_dirty b_io b_more_io 1074 * =============> g fBAedc 1075 * | 1076 * +--> dequeue for IO 1077 */ 1078 static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) 1079 { 1080 int moved; 1081 1082 assert_spin_locked(&wb->list_lock); 1083 list_splice_init(&wb->b_more_io, &wb->b_io); 1084 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); 1085 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, 1086 EXPIRE_DIRTY_ATIME, work); 1087 if (moved) 1088 wb_io_lists_populated(wb); 1089 trace_writeback_queue_io(wb, work, moved); 1090 } 1091 1092 static int write_inode(struct inode *inode, struct writeback_control *wbc) 1093 { 1094 int ret; 1095 1096 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { 1097 trace_writeback_write_inode_start(inode, wbc); 1098 ret = inode->i_sb->s_op->write_inode(inode, wbc); 1099 trace_writeback_write_inode(inode, wbc); 1100 return ret; 1101 } 1102 return 0; 1103 } 1104 1105 /* 1106 * Wait for writeback on an inode to complete. Called with i_lock held. 1107 * Caller must make sure inode cannot go away when we drop i_lock. 1108 */ 1109 static void __inode_wait_for_writeback(struct inode *inode) 1110 __releases(inode->i_lock) 1111 __acquires(inode->i_lock) 1112 { 1113 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 1114 wait_queue_head_t *wqh; 1115 1116 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 1117 while (inode->i_state & I_SYNC) { 1118 spin_unlock(&inode->i_lock); 1119 __wait_on_bit(wqh, &wq, bit_wait, 1120 TASK_UNINTERRUPTIBLE); 1121 spin_lock(&inode->i_lock); 1122 } 1123 } 1124 1125 /* 1126 * Wait for writeback on an inode to complete. Caller must have inode pinned. 1127 */ 1128 void inode_wait_for_writeback(struct inode *inode) 1129 { 1130 spin_lock(&inode->i_lock); 1131 __inode_wait_for_writeback(inode); 1132 spin_unlock(&inode->i_lock); 1133 } 1134 1135 /* 1136 * Sleep until I_SYNC is cleared. This function must be called with i_lock 1137 * held and drops it. It is aimed for callers not holding any inode reference 1138 * so once i_lock is dropped, inode can go away. 1139 */ 1140 static void inode_sleep_on_writeback(struct inode *inode) 1141 __releases(inode->i_lock) 1142 { 1143 DEFINE_WAIT(wait); 1144 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 1145 int sleep; 1146 1147 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 1148 sleep = inode->i_state & I_SYNC; 1149 spin_unlock(&inode->i_lock); 1150 if (sleep) 1151 schedule(); 1152 finish_wait(wqh, &wait); 1153 } 1154 1155 /* 1156 * Find proper writeback list for the inode depending on its current state and 1157 * possibly also change of its state while we were doing writeback. Here we 1158 * handle things such as livelock prevention or fairness of writeback among 1159 * inodes. This function can be called only by flusher thread - noone else 1160 * processes all inodes in writeback lists and requeueing inodes behind flusher 1161 * thread's back can have unexpected consequences. 1162 */ 1163 static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, 1164 struct writeback_control *wbc) 1165 { 1166 if (inode->i_state & I_FREEING) 1167 return; 1168 1169 /* 1170 * Sync livelock prevention. Each inode is tagged and synced in one 1171 * shot. If still dirty, it will be redirty_tail()'ed below. Update 1172 * the dirty time to prevent enqueue and sync it again. 1173 */ 1174 if ((inode->i_state & I_DIRTY) && 1175 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) 1176 inode->dirtied_when = jiffies; 1177 1178 if (wbc->pages_skipped) { 1179 /* 1180 * writeback is not making progress due to locked 1181 * buffers. Skip this inode for now. 1182 */ 1183 redirty_tail(inode, wb); 1184 return; 1185 } 1186 1187 if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 1188 /* 1189 * We didn't write back all the pages. nfs_writepages() 1190 * sometimes bales out without doing anything. 1191 */ 1192 if (wbc->nr_to_write <= 0) { 1193 /* Slice used up. Queue for next turn. */ 1194 requeue_io(inode, wb); 1195 } else { 1196 /* 1197 * Writeback blocked by something other than 1198 * congestion. Delay the inode for some time to 1199 * avoid spinning on the CPU (100% iowait) 1200 * retrying writeback of the dirty page/inode 1201 * that cannot be performed immediately. 1202 */ 1203 redirty_tail(inode, wb); 1204 } 1205 } else if (inode->i_state & I_DIRTY) { 1206 /* 1207 * Filesystems can dirty the inode during writeback operations, 1208 * such as delayed allocation during submission or metadata 1209 * updates after data IO completion. 1210 */ 1211 redirty_tail(inode, wb); 1212 } else if (inode->i_state & I_DIRTY_TIME) { 1213 inode->dirtied_when = jiffies; 1214 inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); 1215 } else { 1216 /* The inode is clean. Remove from writeback lists. */ 1217 inode_io_list_del_locked(inode, wb); 1218 } 1219 } 1220 1221 /* 1222 * Write out an inode and its dirty pages. Do not update the writeback list 1223 * linkage. That is left to the caller. The caller is also responsible for 1224 * setting I_SYNC flag and calling inode_sync_complete() to clear it. 1225 */ 1226 static int 1227 __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 1228 { 1229 struct address_space *mapping = inode->i_mapping; 1230 long nr_to_write = wbc->nr_to_write; 1231 unsigned dirty; 1232 int ret; 1233 1234 WARN_ON(!(inode->i_state & I_SYNC)); 1235 1236 trace_writeback_single_inode_start(inode, wbc, nr_to_write); 1237 1238 ret = do_writepages(mapping, wbc); 1239 1240 /* 1241 * Make sure to wait on the data before writing out the metadata. 1242 * This is important for filesystems that modify metadata on data 1243 * I/O completion. We don't do it for sync(2) writeback because it has a 1244 * separate, external IO completion path and ->sync_fs for guaranteeing 1245 * inode metadata is written back correctly. 1246 */ 1247 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) { 1248 int err = filemap_fdatawait(mapping); 1249 if (ret == 0) 1250 ret = err; 1251 } 1252 1253 /* 1254 * Some filesystems may redirty the inode during the writeback 1255 * due to delalloc, clear dirty metadata flags right before 1256 * write_inode() 1257 */ 1258 spin_lock(&inode->i_lock); 1259 1260 dirty = inode->i_state & I_DIRTY; 1261 if (inode->i_state & I_DIRTY_TIME) { 1262 if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || 1263 unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || 1264 unlikely(time_after(jiffies, 1265 (inode->dirtied_time_when + 1266 dirtytime_expire_interval * HZ)))) { 1267 dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; 1268 trace_writeback_lazytime(inode); 1269 } 1270 } else 1271 inode->i_state &= ~I_DIRTY_TIME_EXPIRED; 1272 inode->i_state &= ~dirty; 1273 1274 /* 1275 * Paired with smp_mb() in __mark_inode_dirty(). This allows 1276 * __mark_inode_dirty() to test i_state without grabbing i_lock - 1277 * either they see the I_DIRTY bits cleared or we see the dirtied 1278 * inode. 1279 * 1280 * I_DIRTY_PAGES is always cleared together above even if @mapping 1281 * still has dirty pages. The flag is reinstated after smp_mb() if 1282 * necessary. This guarantees that either __mark_inode_dirty() 1283 * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY. 1284 */ 1285 smp_mb(); 1286 1287 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 1288 inode->i_state |= I_DIRTY_PAGES; 1289 1290 spin_unlock(&inode->i_lock); 1291 1292 if (dirty & I_DIRTY_TIME) 1293 mark_inode_dirty_sync(inode); 1294 /* Don't write the inode if only I_DIRTY_PAGES was set */ 1295 if (dirty & ~I_DIRTY_PAGES) { 1296 int err = write_inode(inode, wbc); 1297 if (ret == 0) 1298 ret = err; 1299 } 1300 trace_writeback_single_inode(inode, wbc, nr_to_write); 1301 return ret; 1302 } 1303 1304 /* 1305 * Write out an inode's dirty pages. Either the caller has an active reference 1306 * on the inode or the inode has I_WILL_FREE set. 1307 * 1308 * This function is designed to be called for writing back one inode which 1309 * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() 1310 * and does more profound writeback list handling in writeback_sb_inodes(). 1311 */ 1312 static int 1313 writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, 1314 struct writeback_control *wbc) 1315 { 1316 int ret = 0; 1317 1318 spin_lock(&inode->i_lock); 1319 if (!atomic_read(&inode->i_count)) 1320 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 1321 else 1322 WARN_ON(inode->i_state & I_WILL_FREE); 1323 1324 if (inode->i_state & I_SYNC) { 1325 if (wbc->sync_mode != WB_SYNC_ALL) 1326 goto out; 1327 /* 1328 * It's a data-integrity sync. We must wait. Since callers hold 1329 * inode reference or inode has I_WILL_FREE set, it cannot go 1330 * away under us. 1331 */ 1332 __inode_wait_for_writeback(inode); 1333 } 1334 WARN_ON(inode->i_state & I_SYNC); 1335 /* 1336 * Skip inode if it is clean and we have no outstanding writeback in 1337 * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this 1338 * function since flusher thread may be doing for example sync in 1339 * parallel and if we move the inode, it could get skipped. So here we 1340 * make sure inode is on some writeback list and leave it there unless 1341 * we have completely cleaned the inode. 1342 */ 1343 if (!(inode->i_state & I_DIRTY_ALL) && 1344 (wbc->sync_mode != WB_SYNC_ALL || 1345 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) 1346 goto out; 1347 inode->i_state |= I_SYNC; 1348 wbc_attach_and_unlock_inode(wbc, inode); 1349 1350 ret = __writeback_single_inode(inode, wbc); 1351 1352 wbc_detach_inode(wbc); 1353 spin_lock(&wb->list_lock); 1354 spin_lock(&inode->i_lock); 1355 /* 1356 * If inode is clean, remove it from writeback lists. Otherwise don't 1357 * touch it. See comment above for explanation. 1358 */ 1359 if (!(inode->i_state & I_DIRTY_ALL)) 1360 inode_io_list_del_locked(inode, wb); 1361 spin_unlock(&wb->list_lock); 1362 inode_sync_complete(inode); 1363 out: 1364 spin_unlock(&inode->i_lock); 1365 return ret; 1366 } 1367 1368 static long writeback_chunk_size(struct bdi_writeback *wb, 1369 struct wb_writeback_work *work) 1370 { 1371 long pages; 1372 1373 /* 1374 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty 1375 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX 1376 * here avoids calling into writeback_inodes_wb() more than once. 1377 * 1378 * The intended call sequence for WB_SYNC_ALL writeback is: 1379 * 1380 * wb_writeback() 1381 * writeback_sb_inodes() <== called only once 1382 * write_cache_pages() <== called once for each inode 1383 * (quickly) tag currently dirty pages 1384 * (maybe slowly) sync all tagged pages 1385 */ 1386 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) 1387 pages = LONG_MAX; 1388 else { 1389 pages = min(wb->avg_write_bandwidth / 2, 1390 global_wb_domain.dirty_limit / DIRTY_SCOPE); 1391 pages = min(pages, work->nr_pages); 1392 pages = round_down(pages + MIN_WRITEBACK_PAGES, 1393 MIN_WRITEBACK_PAGES); 1394 } 1395 1396 return pages; 1397 } 1398 1399 /* 1400 * Write a portion of b_io inodes which belong to @sb. 1401 * 1402 * Return the number of pages and/or inodes written. 1403 * 1404 * NOTE! This is called with wb->list_lock held, and will 1405 * unlock and relock that for each inode it ends up doing 1406 * IO for. 1407 */ 1408 static long writeback_sb_inodes(struct super_block *sb, 1409 struct bdi_writeback *wb, 1410 struct wb_writeback_work *work) 1411 { 1412 struct writeback_control wbc = { 1413 .sync_mode = work->sync_mode, 1414 .tagged_writepages = work->tagged_writepages, 1415 .for_kupdate = work->for_kupdate, 1416 .for_background = work->for_background, 1417 .for_sync = work->for_sync, 1418 .range_cyclic = work->range_cyclic, 1419 .range_start = 0, 1420 .range_end = LLONG_MAX, 1421 }; 1422 unsigned long start_time = jiffies; 1423 long write_chunk; 1424 long wrote = 0; /* count both pages and inodes */ 1425 1426 while (!list_empty(&wb->b_io)) { 1427 struct inode *inode = wb_inode(wb->b_io.prev); 1428 1429 if (inode->i_sb != sb) { 1430 if (work->sb) { 1431 /* 1432 * We only want to write back data for this 1433 * superblock, move all inodes not belonging 1434 * to it back onto the dirty list. 1435 */ 1436 redirty_tail(inode, wb); 1437 continue; 1438 } 1439 1440 /* 1441 * The inode belongs to a different superblock. 1442 * Bounce back to the caller to unpin this and 1443 * pin the next superblock. 1444 */ 1445 break; 1446 } 1447 1448 /* 1449 * Don't bother with new inodes or inodes being freed, first 1450 * kind does not need periodic writeout yet, and for the latter 1451 * kind writeout is handled by the freer. 1452 */ 1453 spin_lock(&inode->i_lock); 1454 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 1455 spin_unlock(&inode->i_lock); 1456 redirty_tail(inode, wb); 1457 continue; 1458 } 1459 if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { 1460 /* 1461 * If this inode is locked for writeback and we are not 1462 * doing writeback-for-data-integrity, move it to 1463 * b_more_io so that writeback can proceed with the 1464 * other inodes on s_io. 1465 * 1466 * We'll have another go at writing back this inode 1467 * when we completed a full scan of b_io. 1468 */ 1469 spin_unlock(&inode->i_lock); 1470 requeue_io(inode, wb); 1471 trace_writeback_sb_inodes_requeue(inode); 1472 continue; 1473 } 1474 spin_unlock(&wb->list_lock); 1475 1476 /* 1477 * We already requeued the inode if it had I_SYNC set and we 1478 * are doing WB_SYNC_NONE writeback. So this catches only the 1479 * WB_SYNC_ALL case. 1480 */ 1481 if (inode->i_state & I_SYNC) { 1482 /* Wait for I_SYNC. This function drops i_lock... */ 1483 inode_sleep_on_writeback(inode); 1484 /* Inode may be gone, start again */ 1485 spin_lock(&wb->list_lock); 1486 continue; 1487 } 1488 inode->i_state |= I_SYNC; 1489 wbc_attach_and_unlock_inode(&wbc, inode); 1490 1491 write_chunk = writeback_chunk_size(wb, work); 1492 wbc.nr_to_write = write_chunk; 1493 wbc.pages_skipped = 0; 1494 1495 /* 1496 * We use I_SYNC to pin the inode in memory. While it is set 1497 * evict_inode() will wait so the inode cannot be freed. 1498 */ 1499 __writeback_single_inode(inode, &wbc); 1500 1501 wbc_detach_inode(&wbc); 1502 work->nr_pages -= write_chunk - wbc.nr_to_write; 1503 wrote += write_chunk - wbc.nr_to_write; 1504 1505 if (need_resched()) { 1506 /* 1507 * We're trying to balance between building up a nice 1508 * long list of IOs to improve our merge rate, and 1509 * getting those IOs out quickly for anyone throttling 1510 * in balance_dirty_pages(). cond_resched() doesn't 1511 * unplug, so get our IOs out the door before we 1512 * give up the CPU. 1513 */ 1514 blk_flush_plug(current); 1515 cond_resched(); 1516 } 1517 1518 1519 spin_lock(&wb->list_lock); 1520 spin_lock(&inode->i_lock); 1521 if (!(inode->i_state & I_DIRTY_ALL)) 1522 wrote++; 1523 requeue_inode(inode, wb, &wbc); 1524 inode_sync_complete(inode); 1525 spin_unlock(&inode->i_lock); 1526 1527 /* 1528 * bail out to wb_writeback() often enough to check 1529 * background threshold and other termination conditions. 1530 */ 1531 if (wrote) { 1532 if (time_is_before_jiffies(start_time + HZ / 10UL)) 1533 break; 1534 if (work->nr_pages <= 0) 1535 break; 1536 } 1537 } 1538 return wrote; 1539 } 1540 1541 static long __writeback_inodes_wb(struct bdi_writeback *wb, 1542 struct wb_writeback_work *work) 1543 { 1544 unsigned long start_time = jiffies; 1545 long wrote = 0; 1546 1547 while (!list_empty(&wb->b_io)) { 1548 struct inode *inode = wb_inode(wb->b_io.prev); 1549 struct super_block *sb = inode->i_sb; 1550 1551 if (!trylock_super(sb)) { 1552 /* 1553 * trylock_super() may fail consistently due to 1554 * s_umount being grabbed by someone else. Don't use 1555 * requeue_io() to avoid busy retrying the inode/sb. 1556 */ 1557 redirty_tail(inode, wb); 1558 continue; 1559 } 1560 wrote += writeback_sb_inodes(sb, wb, work); 1561 up_read(&sb->s_umount); 1562 1563 /* refer to the same tests at the end of writeback_sb_inodes */ 1564 if (wrote) { 1565 if (time_is_before_jiffies(start_time + HZ / 10UL)) 1566 break; 1567 if (work->nr_pages <= 0) 1568 break; 1569 } 1570 } 1571 /* Leave any unwritten inodes on b_io */ 1572 return wrote; 1573 } 1574 1575 static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, 1576 enum wb_reason reason) 1577 { 1578 struct wb_writeback_work work = { 1579 .nr_pages = nr_pages, 1580 .sync_mode = WB_SYNC_NONE, 1581 .range_cyclic = 1, 1582 .reason = reason, 1583 }; 1584 struct blk_plug plug; 1585 1586 blk_start_plug(&plug); 1587 spin_lock(&wb->list_lock); 1588 if (list_empty(&wb->b_io)) 1589 queue_io(wb, &work); 1590 __writeback_inodes_wb(wb, &work); 1591 spin_unlock(&wb->list_lock); 1592 blk_finish_plug(&plug); 1593 1594 return nr_pages - work.nr_pages; 1595 } 1596 1597 /* 1598 * Explicit flushing or periodic writeback of "old" data. 1599 * 1600 * Define "old": the first time one of an inode's pages is dirtied, we mark the 1601 * dirtying-time in the inode's address_space. So this periodic writeback code 1602 * just walks the superblock inode list, writing back any inodes which are 1603 * older than a specific point in time. 1604 * 1605 * Try to run once per dirty_writeback_interval. But if a writeback event 1606 * takes longer than a dirty_writeback_interval interval, then leave a 1607 * one-second gap. 1608 * 1609 * older_than_this takes precedence over nr_to_write. So we'll only write back 1610 * all dirty pages if they are all attached to "old" mappings. 1611 */ 1612 static long wb_writeback(struct bdi_writeback *wb, 1613 struct wb_writeback_work *work) 1614 { 1615 unsigned long wb_start = jiffies; 1616 long nr_pages = work->nr_pages; 1617 unsigned long oldest_jif; 1618 struct inode *inode; 1619 long progress; 1620 struct blk_plug plug; 1621 1622 oldest_jif = jiffies; 1623 work->older_than_this = &oldest_jif; 1624 1625 blk_start_plug(&plug); 1626 spin_lock(&wb->list_lock); 1627 for (;;) { 1628 /* 1629 * Stop writeback when nr_pages has been consumed 1630 */ 1631 if (work->nr_pages <= 0) 1632 break; 1633 1634 /* 1635 * Background writeout and kupdate-style writeback may 1636 * run forever. Stop them if there is other work to do 1637 * so that e.g. sync can proceed. They'll be restarted 1638 * after the other works are all done. 1639 */ 1640 if ((work->for_background || work->for_kupdate) && 1641 !list_empty(&wb->work_list)) 1642 break; 1643 1644 /* 1645 * For background writeout, stop when we are below the 1646 * background dirty threshold 1647 */ 1648 if (work->for_background && !wb_over_bg_thresh(wb)) 1649 break; 1650 1651 /* 1652 * Kupdate and background works are special and we want to 1653 * include all inodes that need writing. Livelock avoidance is 1654 * handled by these works yielding to any other work so we are 1655 * safe. 1656 */ 1657 if (work->for_kupdate) { 1658 oldest_jif = jiffies - 1659 msecs_to_jiffies(dirty_expire_interval * 10); 1660 } else if (work->for_background) 1661 oldest_jif = jiffies; 1662 1663 trace_writeback_start(wb, work); 1664 if (list_empty(&wb->b_io)) 1665 queue_io(wb, work); 1666 if (work->sb) 1667 progress = writeback_sb_inodes(work->sb, wb, work); 1668 else 1669 progress = __writeback_inodes_wb(wb, work); 1670 trace_writeback_written(wb, work); 1671 1672 wb_update_bandwidth(wb, wb_start); 1673 1674 /* 1675 * Did we write something? Try for more 1676 * 1677 * Dirty inodes are moved to b_io for writeback in batches. 1678 * The completion of the current batch does not necessarily 1679 * mean the overall work is done. So we keep looping as long 1680 * as made some progress on cleaning pages or inodes. 1681 */ 1682 if (progress) 1683 continue; 1684 /* 1685 * No more inodes for IO, bail 1686 */ 1687 if (list_empty(&wb->b_more_io)) 1688 break; 1689 /* 1690 * Nothing written. Wait for some inode to 1691 * become available for writeback. Otherwise 1692 * we'll just busyloop. 1693 */ 1694 if (!list_empty(&wb->b_more_io)) { 1695 trace_writeback_wait(wb, work); 1696 inode = wb_inode(wb->b_more_io.prev); 1697 spin_lock(&inode->i_lock); 1698 spin_unlock(&wb->list_lock); 1699 /* This function drops i_lock... */ 1700 inode_sleep_on_writeback(inode); 1701 spin_lock(&wb->list_lock); 1702 } 1703 } 1704 spin_unlock(&wb->list_lock); 1705 blk_finish_plug(&plug); 1706 1707 return nr_pages - work->nr_pages; 1708 } 1709 1710 /* 1711 * Return the next wb_writeback_work struct that hasn't been processed yet. 1712 */ 1713 static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) 1714 { 1715 struct wb_writeback_work *work = NULL; 1716 1717 spin_lock_bh(&wb->work_lock); 1718 if (!list_empty(&wb->work_list)) { 1719 work = list_entry(wb->work_list.next, 1720 struct wb_writeback_work, list); 1721 list_del_init(&work->list); 1722 } 1723 spin_unlock_bh(&wb->work_lock); 1724 return work; 1725 } 1726 1727 /* 1728 * Add in the number of potentially dirty inodes, because each inode 1729 * write can dirty pagecache in the underlying blockdev. 1730 */ 1731 static unsigned long get_nr_dirty_pages(void) 1732 { 1733 return global_page_state(NR_FILE_DIRTY) + 1734 global_page_state(NR_UNSTABLE_NFS) + 1735 get_nr_dirty_inodes(); 1736 } 1737 1738 static long wb_check_background_flush(struct bdi_writeback *wb) 1739 { 1740 if (wb_over_bg_thresh(wb)) { 1741 1742 struct wb_writeback_work work = { 1743 .nr_pages = LONG_MAX, 1744 .sync_mode = WB_SYNC_NONE, 1745 .for_background = 1, 1746 .range_cyclic = 1, 1747 .reason = WB_REASON_BACKGROUND, 1748 }; 1749 1750 return wb_writeback(wb, &work); 1751 } 1752 1753 return 0; 1754 } 1755 1756 static long wb_check_old_data_flush(struct bdi_writeback *wb) 1757 { 1758 unsigned long expired; 1759 long nr_pages; 1760 1761 /* 1762 * When set to zero, disable periodic writeback 1763 */ 1764 if (!dirty_writeback_interval) 1765 return 0; 1766 1767 expired = wb->last_old_flush + 1768 msecs_to_jiffies(dirty_writeback_interval * 10); 1769 if (time_before(jiffies, expired)) 1770 return 0; 1771 1772 wb->last_old_flush = jiffies; 1773 nr_pages = get_nr_dirty_pages(); 1774 1775 if (nr_pages) { 1776 struct wb_writeback_work work = { 1777 .nr_pages = nr_pages, 1778 .sync_mode = WB_SYNC_NONE, 1779 .for_kupdate = 1, 1780 .range_cyclic = 1, 1781 .reason = WB_REASON_PERIODIC, 1782 }; 1783 1784 return wb_writeback(wb, &work); 1785 } 1786 1787 return 0; 1788 } 1789 1790 /* 1791 * Retrieve work items and do the writeback they describe 1792 */ 1793 static long wb_do_writeback(struct bdi_writeback *wb) 1794 { 1795 struct wb_writeback_work *work; 1796 long wrote = 0; 1797 1798 set_bit(WB_writeback_running, &wb->state); 1799 while ((work = get_next_work_item(wb)) != NULL) { 1800 struct wb_completion *done = work->done; 1801 1802 trace_writeback_exec(wb, work); 1803 1804 wrote += wb_writeback(wb, work); 1805 1806 if (work->auto_free) 1807 kfree(work); 1808 if (done && atomic_dec_and_test(&done->cnt)) 1809 wake_up_all(&wb->bdi->wb_waitq); 1810 } 1811 1812 /* 1813 * Check for periodic writeback, kupdated() style 1814 */ 1815 wrote += wb_check_old_data_flush(wb); 1816 wrote += wb_check_background_flush(wb); 1817 clear_bit(WB_writeback_running, &wb->state); 1818 1819 return wrote; 1820 } 1821 1822 /* 1823 * Handle writeback of dirty data for the device backed by this bdi. Also 1824 * reschedules periodically and does kupdated style flushing. 1825 */ 1826 void wb_workfn(struct work_struct *work) 1827 { 1828 struct bdi_writeback *wb = container_of(to_delayed_work(work), 1829 struct bdi_writeback, dwork); 1830 long pages_written; 1831 1832 set_worker_desc("flush-%s", dev_name(wb->bdi->dev)); 1833 current->flags |= PF_SWAPWRITE; 1834 1835 if (likely(!current_is_workqueue_rescuer() || 1836 !test_bit(WB_registered, &wb->state))) { 1837 /* 1838 * The normal path. Keep writing back @wb until its 1839 * work_list is empty. Note that this path is also taken 1840 * if @wb is shutting down even when we're running off the 1841 * rescuer as work_list needs to be drained. 1842 */ 1843 do { 1844 pages_written = wb_do_writeback(wb); 1845 trace_writeback_pages_written(pages_written); 1846 } while (!list_empty(&wb->work_list)); 1847 } else { 1848 /* 1849 * bdi_wq can't get enough workers and we're running off 1850 * the emergency worker. Don't hog it. Hopefully, 1024 is 1851 * enough for efficient IO. 1852 */ 1853 pages_written = writeback_inodes_wb(wb, 1024, 1854 WB_REASON_FORKER_THREAD); 1855 trace_writeback_pages_written(pages_written); 1856 } 1857 1858 if (!list_empty(&wb->work_list)) 1859 mod_delayed_work(bdi_wq, &wb->dwork, 0); 1860 else if (wb_has_dirty_io(wb) && dirty_writeback_interval) 1861 wb_wakeup_delayed(wb); 1862 1863 current->flags &= ~PF_SWAPWRITE; 1864 } 1865 1866 /* 1867 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 1868 * the whole world. 1869 */ 1870 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) 1871 { 1872 struct backing_dev_info *bdi; 1873 1874 if (!nr_pages) 1875 nr_pages = get_nr_dirty_pages(); 1876 1877 rcu_read_lock(); 1878 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1879 struct bdi_writeback *wb; 1880 1881 if (!bdi_has_dirty_io(bdi)) 1882 continue; 1883 1884 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) 1885 wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), 1886 false, reason); 1887 } 1888 rcu_read_unlock(); 1889 } 1890 1891 /* 1892 * Wake up bdi's periodically to make sure dirtytime inodes gets 1893 * written back periodically. We deliberately do *not* check the 1894 * b_dirtytime list in wb_has_dirty_io(), since this would cause the 1895 * kernel to be constantly waking up once there are any dirtytime 1896 * inodes on the system. So instead we define a separate delayed work 1897 * function which gets called much more rarely. (By default, only 1898 * once every 12 hours.) 1899 * 1900 * If there is any other write activity going on in the file system, 1901 * this function won't be necessary. But if the only thing that has 1902 * happened on the file system is a dirtytime inode caused by an atime 1903 * update, we need this infrastructure below to make sure that inode 1904 * eventually gets pushed out to disk. 1905 */ 1906 static void wakeup_dirtytime_writeback(struct work_struct *w); 1907 static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback); 1908 1909 static void wakeup_dirtytime_writeback(struct work_struct *w) 1910 { 1911 struct backing_dev_info *bdi; 1912 1913 rcu_read_lock(); 1914 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1915 struct bdi_writeback *wb; 1916 1917 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) 1918 if (!list_empty(&wb->b_dirty_time)) 1919 wb_wakeup(wb); 1920 } 1921 rcu_read_unlock(); 1922 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); 1923 } 1924 1925 static int __init start_dirtytime_writeback(void) 1926 { 1927 schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); 1928 return 0; 1929 } 1930 __initcall(start_dirtytime_writeback); 1931 1932 int dirtytime_interval_handler(struct ctl_table *table, int write, 1933 void __user *buffer, size_t *lenp, loff_t *ppos) 1934 { 1935 int ret; 1936 1937 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 1938 if (ret == 0 && write) 1939 mod_delayed_work(system_wq, &dirtytime_work, 0); 1940 return ret; 1941 } 1942 1943 static noinline void block_dump___mark_inode_dirty(struct inode *inode) 1944 { 1945 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 1946 struct dentry *dentry; 1947 const char *name = "?"; 1948 1949 dentry = d_find_alias(inode); 1950 if (dentry) { 1951 spin_lock(&dentry->d_lock); 1952 name = (const char *) dentry->d_name.name; 1953 } 1954 printk(KERN_DEBUG 1955 "%s(%d): dirtied inode %lu (%s) on %s\n", 1956 current->comm, task_pid_nr(current), inode->i_ino, 1957 name, inode->i_sb->s_id); 1958 if (dentry) { 1959 spin_unlock(&dentry->d_lock); 1960 dput(dentry); 1961 } 1962 } 1963 } 1964 1965 /** 1966 * __mark_inode_dirty - internal function 1967 * @inode: inode to mark 1968 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) 1969 * Mark an inode as dirty. Callers should use mark_inode_dirty or 1970 * mark_inode_dirty_sync. 1971 * 1972 * Put the inode on the super block's dirty list. 1973 * 1974 * CAREFUL! We mark it dirty unconditionally, but move it onto the 1975 * dirty list only if it is hashed or if it refers to a blockdev. 1976 * If it was not hashed, it will never be added to the dirty list 1977 * even if it is later hashed, as it will have been marked dirty already. 1978 * 1979 * In short, make sure you hash any inodes _before_ you start marking 1980 * them dirty. 1981 * 1982 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of 1983 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of 1984 * the kernel-internal blockdev inode represents the dirtying time of the 1985 * blockdev's pages. This is why for I_DIRTY_PAGES we always use 1986 * page->mapping->host, so the page-dirtying time is recorded in the internal 1987 * blockdev inode. 1988 */ 1989 void __mark_inode_dirty(struct inode *inode, int flags) 1990 { 1991 #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) 1992 struct super_block *sb = inode->i_sb; 1993 int dirtytime; 1994 1995 trace_writeback_mark_inode_dirty(inode, flags); 1996 1997 /* 1998 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1999 * dirty the inode itself 2000 */ 2001 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) { 2002 trace_writeback_dirty_inode_start(inode, flags); 2003 2004 if (sb->s_op->dirty_inode) 2005 sb->s_op->dirty_inode(inode, flags); 2006 2007 trace_writeback_dirty_inode(inode, flags); 2008 } 2009 if (flags & I_DIRTY_INODE) 2010 flags &= ~I_DIRTY_TIME; 2011 dirtytime = flags & I_DIRTY_TIME; 2012 2013 /* 2014 * Paired with smp_mb() in __writeback_single_inode() for the 2015 * following lockless i_state test. See there for details. 2016 */ 2017 smp_mb(); 2018 2019 if (((inode->i_state & flags) == flags) || 2020 (dirtytime && (inode->i_state & I_DIRTY_INODE))) 2021 return; 2022 2023 if (unlikely(block_dump)) 2024 block_dump___mark_inode_dirty(inode); 2025 2026 spin_lock(&inode->i_lock); 2027 if (dirtytime && (inode->i_state & I_DIRTY_INODE)) 2028 goto out_unlock_inode; 2029 if ((inode->i_state & flags) != flags) { 2030 const int was_dirty = inode->i_state & I_DIRTY; 2031 2032 inode_attach_wb(inode, NULL); 2033 2034 if (flags & I_DIRTY_INODE) 2035 inode->i_state &= ~I_DIRTY_TIME; 2036 inode->i_state |= flags; 2037 2038 /* 2039 * If the inode is being synced, just update its dirty state. 2040 * The unlocker will place the inode on the appropriate 2041 * superblock list, based upon its state. 2042 */ 2043 if (inode->i_state & I_SYNC) 2044 goto out_unlock_inode; 2045 2046 /* 2047 * Only add valid (hashed) inodes to the superblock's 2048 * dirty list. Add blockdev inodes as well. 2049 */ 2050 if (!S_ISBLK(inode->i_mode)) { 2051 if (inode_unhashed(inode)) 2052 goto out_unlock_inode; 2053 } 2054 if (inode->i_state & I_FREEING) 2055 goto out_unlock_inode; 2056 2057 /* 2058 * If the inode was already on b_dirty/b_io/b_more_io, don't 2059 * reposition it (that would break b_dirty time-ordering). 2060 */ 2061 if (!was_dirty) { 2062 struct bdi_writeback *wb; 2063 struct list_head *dirty_list; 2064 bool wakeup_bdi = false; 2065 2066 wb = locked_inode_to_wb_and_lock_list(inode); 2067 2068 WARN(bdi_cap_writeback_dirty(wb->bdi) && 2069 !test_bit(WB_registered, &wb->state), 2070 "bdi-%s not registered\n", wb->bdi->name); 2071 2072 inode->dirtied_when = jiffies; 2073 if (dirtytime) 2074 inode->dirtied_time_when = jiffies; 2075 2076 if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) 2077 dirty_list = &wb->b_dirty; 2078 else 2079 dirty_list = &wb->b_dirty_time; 2080 2081 wakeup_bdi = inode_io_list_move_locked(inode, wb, 2082 dirty_list); 2083 2084 spin_unlock(&wb->list_lock); 2085 trace_writeback_dirty_inode_enqueue(inode); 2086 2087 /* 2088 * If this is the first dirty inode for this bdi, 2089 * we have to wake-up the corresponding bdi thread 2090 * to make sure background write-back happens 2091 * later. 2092 */ 2093 if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi) 2094 wb_wakeup_delayed(wb); 2095 return; 2096 } 2097 } 2098 out_unlock_inode: 2099 spin_unlock(&inode->i_lock); 2100 2101 #undef I_DIRTY_INODE 2102 } 2103 EXPORT_SYMBOL(__mark_inode_dirty); 2104 2105 /* 2106 * The @s_sync_lock is used to serialise concurrent sync operations 2107 * to avoid lock contention problems with concurrent wait_sb_inodes() calls. 2108 * Concurrent callers will block on the s_sync_lock rather than doing contending 2109 * walks. The queueing maintains sync(2) required behaviour as all the IO that 2110 * has been issued up to the time this function is enter is guaranteed to be 2111 * completed by the time we have gained the lock and waited for all IO that is 2112 * in progress regardless of the order callers are granted the lock. 2113 */ 2114 static void wait_sb_inodes(struct super_block *sb) 2115 { 2116 struct inode *inode, *old_inode = NULL; 2117 2118 /* 2119 * We need to be protected against the filesystem going from 2120 * r/o to r/w or vice versa. 2121 */ 2122 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 2123 2124 mutex_lock(&sb->s_sync_lock); 2125 spin_lock(&sb->s_inode_list_lock); 2126 2127 /* 2128 * Data integrity sync. Must wait for all pages under writeback, 2129 * because there may have been pages dirtied before our sync 2130 * call, but which had writeout started before we write it out. 2131 * In which case, the inode may not be on the dirty list, but 2132 * we still have to wait for that writeout. 2133 */ 2134 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 2135 struct address_space *mapping = inode->i_mapping; 2136 2137 spin_lock(&inode->i_lock); 2138 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || 2139 (mapping->nrpages == 0)) { 2140 spin_unlock(&inode->i_lock); 2141 continue; 2142 } 2143 __iget(inode); 2144 spin_unlock(&inode->i_lock); 2145 spin_unlock(&sb->s_inode_list_lock); 2146 2147 /* 2148 * We hold a reference to 'inode' so it couldn't have been 2149 * removed from s_inodes list while we dropped the 2150 * s_inode_list_lock. We cannot iput the inode now as we can 2151 * be holding the last reference and we cannot iput it under 2152 * s_inode_list_lock. So we keep the reference and iput it 2153 * later. 2154 */ 2155 iput(old_inode); 2156 old_inode = inode; 2157 2158 /* 2159 * We keep the error status of individual mapping so that 2160 * applications can catch the writeback error using fsync(2). 2161 * See filemap_fdatawait_keep_errors() for details. 2162 */ 2163 filemap_fdatawait_keep_errors(mapping); 2164 2165 cond_resched(); 2166 2167 spin_lock(&sb->s_inode_list_lock); 2168 } 2169 spin_unlock(&sb->s_inode_list_lock); 2170 iput(old_inode); 2171 mutex_unlock(&sb->s_sync_lock); 2172 } 2173 2174 static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, 2175 enum wb_reason reason, bool skip_if_busy) 2176 { 2177 DEFINE_WB_COMPLETION_ONSTACK(done); 2178 struct wb_writeback_work work = { 2179 .sb = sb, 2180 .sync_mode = WB_SYNC_NONE, 2181 .tagged_writepages = 1, 2182 .done = &done, 2183 .nr_pages = nr, 2184 .reason = reason, 2185 }; 2186 struct backing_dev_info *bdi = sb->s_bdi; 2187 2188 if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) 2189 return; 2190 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 2191 2192 bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); 2193 wb_wait_for_completion(bdi, &done); 2194 } 2195 2196 /** 2197 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block 2198 * @sb: the superblock 2199 * @nr: the number of pages to write 2200 * @reason: reason why some writeback work initiated 2201 * 2202 * Start writeback on some inodes on this super_block. No guarantees are made 2203 * on how many (if any) will be written, and this function does not wait 2204 * for IO completion of submitted IO. 2205 */ 2206 void writeback_inodes_sb_nr(struct super_block *sb, 2207 unsigned long nr, 2208 enum wb_reason reason) 2209 { 2210 __writeback_inodes_sb_nr(sb, nr, reason, false); 2211 } 2212 EXPORT_SYMBOL(writeback_inodes_sb_nr); 2213 2214 /** 2215 * writeback_inodes_sb - writeback dirty inodes from given super_block 2216 * @sb: the superblock 2217 * @reason: reason why some writeback work was initiated 2218 * 2219 * Start writeback on some inodes on this super_block. No guarantees are made 2220 * on how many (if any) will be written, and this function does not wait 2221 * for IO completion of submitted IO. 2222 */ 2223 void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) 2224 { 2225 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); 2226 } 2227 EXPORT_SYMBOL(writeback_inodes_sb); 2228 2229 /** 2230 * try_to_writeback_inodes_sb_nr - try to start writeback if none underway 2231 * @sb: the superblock 2232 * @nr: the number of pages to write 2233 * @reason: the reason of writeback 2234 * 2235 * Invoke writeback_inodes_sb_nr if no writeback is currently underway. 2236 * Returns 1 if writeback was started, 0 if not. 2237 */ 2238 bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, 2239 enum wb_reason reason) 2240 { 2241 if (!down_read_trylock(&sb->s_umount)) 2242 return false; 2243 2244 __writeback_inodes_sb_nr(sb, nr, reason, true); 2245 up_read(&sb->s_umount); 2246 return true; 2247 } 2248 EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); 2249 2250 /** 2251 * try_to_writeback_inodes_sb - try to start writeback if none underway 2252 * @sb: the superblock 2253 * @reason: reason why some writeback work was initiated 2254 * 2255 * Implement by try_to_writeback_inodes_sb_nr() 2256 * Returns 1 if writeback was started, 0 if not. 2257 */ 2258 bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) 2259 { 2260 return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); 2261 } 2262 EXPORT_SYMBOL(try_to_writeback_inodes_sb); 2263 2264 /** 2265 * sync_inodes_sb - sync sb inode pages 2266 * @sb: the superblock 2267 * 2268 * This function writes and waits on any dirty inode belonging to this 2269 * super_block. 2270 */ 2271 void sync_inodes_sb(struct super_block *sb) 2272 { 2273 DEFINE_WB_COMPLETION_ONSTACK(done); 2274 struct wb_writeback_work work = { 2275 .sb = sb, 2276 .sync_mode = WB_SYNC_ALL, 2277 .nr_pages = LONG_MAX, 2278 .range_cyclic = 0, 2279 .done = &done, 2280 .reason = WB_REASON_SYNC, 2281 .for_sync = 1, 2282 }; 2283 struct backing_dev_info *bdi = sb->s_bdi; 2284 2285 /* 2286 * Can't skip on !bdi_has_dirty() because we should wait for !dirty 2287 * inodes under writeback and I_DIRTY_TIME inodes ignored by 2288 * bdi_has_dirty() need to be written out too. 2289 */ 2290 if (bdi == &noop_backing_dev_info) 2291 return; 2292 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 2293 2294 bdi_split_work_to_wbs(bdi, &work, false); 2295 wb_wait_for_completion(bdi, &done); 2296 2297 wait_sb_inodes(sb); 2298 } 2299 EXPORT_SYMBOL(sync_inodes_sb); 2300 2301 /** 2302 * write_inode_now - write an inode to disk 2303 * @inode: inode to write to disk 2304 * @sync: whether the write should be synchronous or not 2305 * 2306 * This function commits an inode to disk immediately if it is dirty. This is 2307 * primarily needed by knfsd. 2308 * 2309 * The caller must either have a ref on the inode or must have set I_WILL_FREE. 2310 */ 2311 int write_inode_now(struct inode *inode, int sync) 2312 { 2313 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 2314 struct writeback_control wbc = { 2315 .nr_to_write = LONG_MAX, 2316 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, 2317 .range_start = 0, 2318 .range_end = LLONG_MAX, 2319 }; 2320 2321 if (!mapping_cap_writeback_dirty(inode->i_mapping)) 2322 wbc.nr_to_write = 0; 2323 2324 might_sleep(); 2325 return writeback_single_inode(inode, wb, &wbc); 2326 } 2327 EXPORT_SYMBOL(write_inode_now); 2328 2329 /** 2330 * sync_inode - write an inode and its pages to disk. 2331 * @inode: the inode to sync 2332 * @wbc: controls the writeback mode 2333 * 2334 * sync_inode() will write an inode and its pages to disk. It will also 2335 * correctly update the inode on its superblock's dirty inode lists and will 2336 * update inode->i_state. 2337 * 2338 * The caller must have a ref on the inode. 2339 */ 2340 int sync_inode(struct inode *inode, struct writeback_control *wbc) 2341 { 2342 return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc); 2343 } 2344 EXPORT_SYMBOL(sync_inode); 2345 2346 /** 2347 * sync_inode_metadata - write an inode to disk 2348 * @inode: the inode to sync 2349 * @wait: wait for I/O to complete. 2350 * 2351 * Write an inode to disk and adjust its dirty state after completion. 2352 * 2353 * Note: only writes the actual inode, no associated data or other metadata. 2354 */ 2355 int sync_inode_metadata(struct inode *inode, int wait) 2356 { 2357 struct writeback_control wbc = { 2358 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, 2359 .nr_to_write = 0, /* metadata-only */ 2360 }; 2361 2362 return sync_inode(inode, &wbc); 2363 } 2364 EXPORT_SYMBOL(sync_inode_metadata); 2365