/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/tracepoint.h>
#include "internal.h"

/*
 * The maximum number of pages to writeout in a single bdi flush/kupdate
 * operation.  We do this so we don't hold I_SYNC against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode.  Also, the code reevaluates
 * the dirty state each time it has written this many pages.
 */
#define MAX_WRITEBACK_PAGES	1024L

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;

	struct list_head list;		/* pending work list */
	struct completion *done;	/* set if the caller waits */
};

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure so that the definition remains local to this
 * file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

/*
 * We don't actually have pdflush, but this one is exported through /proc...
 */
int nr_pdflush_threads;

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback waiting to be handled against a
 * backing device.
 */
int writeback_in_progress(struct backing_dev_info *bdi)
{
	return test_bit(BDI_writeback_running, &bdi->state);
}

static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (strcmp(sb->s_type->name, "bdev") == 0)
		return inode->i_mapping->backing_dev_info;

	return sb->s_bdi;
}

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_wb_list);
}

/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
{
	if (bdi->wb.task) {
		wake_up_process(bdi->wb.task);
	} else {
		/*
		 * The bdi thread isn't there, wake up the forker thread which
		 * will create and run it.
		 */
		wake_up_process(default_backing_dev_info.wb.task);
	}
}

static void bdi_queue_work(struct backing_dev_info *bdi,
			   struct wb_writeback_work *work)
{
	trace_writeback_queue(bdi, work);

	spin_lock_bh(&bdi->wb_lock);
	list_add_tail(&work->list, &bdi->work_list);
	if (!bdi->wb.task)
		trace_writeback_nothread(bdi, work);
	bdi_wakeup_flusher(bdi);
	spin_unlock_bh(&bdi->wb_lock);
}

static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
		      bool range_cyclic)
{
	struct wb_writeback_work *work;

	/*
	 * This is WB_SYNC_NONE writeback, so if allocation fails just
	 * wakeup the thread for old dirty data writeback
	 */
	work = kzalloc(sizeof(*work), GFP_ATOMIC);
	if (!work) {
		if (bdi->wb.task) {
			trace_writeback_nowork(bdi);
			wake_up_process(bdi->wb.task);
		}
		return;
	}

	work->sync_mode	= WB_SYNC_NONE;
	work->nr_pages	= nr_pages;
	work->range_cyclic = range_cyclic;

	bdi_queue_work(bdi, work);
}

/**
 * bdi_start_writeback - start writeback
 * @bdi: the backing device to write from
 * @nr_pages: the number of pages to write
 *
 * Description:
 *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
 *   started when this function returns, we make no guarantees on
 *   completion. Caller need not hold sb s_umount semaphore.
 *
 */
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
{
	__bdi_start_writeback(bdi, nr_pages, true);
}

/**
 * bdi_start_background_writeback - start background writeback
 * @bdi: the backing device to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for given BDI
 *   some IO is happening if we are over background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void bdi_start_background_writeback(struct backing_dev_info *bdi)
{
	/*
	 * We just wake up the flusher thread. It will perform background
	 * writeback as soon as there is no other work to do.
	 */
	trace_writeback_wake_background(bdi);
	spin_lock_bh(&bdi->wb_lock);
	bdi_wakeup_flusher(bdi);
	spin_unlock_bh(&bdi->wb_lock);
}
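
/*
 * Illustrative sketch (not in the original file): a caller that finds the
 * system over the background dirty threshold, e.g. on the dirty throttling
 * path in mm/page-writeback.c, would typically kick the flusher with
 *
 *	if (!writeback_in_progress(bdi))
 *		bdi_start_background_writeback(bdi);
 *
 * while a caller that knows how many pages it wants written would use
 * bdi_start_writeback(bdi, nr_pages) instead.  Neither call waits for the
 * IO to complete.
 */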

/*
 * Remove the inode from the writeback list it is on.
 */
void inode_wb_list_del(struct inode *inode)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);

	spin_lock(&bdi->wb.list_lock);
	list_del_init(&inode->i_wb_list);
	spin_unlock(&bdi->wb.list_lock);
}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = wb_inode(wb->b_dirty.next);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	list_move(&inode->i_wb_list, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	list_move(&inode->i_wb_list, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
	/*
	 * Prevent speculative execution through
	 * spin_unlock(&wb->list_lock);
	 */

	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in distant past.
	 * This test is necessary to prevent such wrapped-around relative times
	 * from permanently stopping the whole bdi writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}

/*
 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
			       struct list_head *dispatch_queue,
			       unsigned long *older_than_this)
{
	LIST_HEAD(tmp);
	struct list_head *pos, *node;
	struct super_block *sb = NULL;
	struct inode *inode;
	int do_sb_sort = 0;
	int moved = 0;

	while (!list_empty(delaying_queue)) {
		inode = wb_inode(delaying_queue->prev);
		if (older_than_this &&
		    inode_dirtied_after(inode, *older_than_this))
			break;
		if (sb && sb != inode->i_sb)
			do_sb_sort = 1;
		sb = inode->i_sb;
		list_move(&inode->i_wb_list, &tmp);
		moved++;
	}

	/* just one sb in list, splice to dispatch_queue and we're done */
	if (!do_sb_sort) {
		list_splice(&tmp, dispatch_queue);
		goto out;
	}

	/* Move inodes from one superblock together */
	while (!list_empty(&tmp)) {
		sb = wb_inode(tmp.prev)->i_sb;
		list_for_each_prev_safe(pos, node, &tmp) {
			inode = wb_inode(pos);
			if (inode->i_sb == sb)
				list_move(&inode->i_wb_list, dispatch_queue);
		}
	}
out:
	return moved;
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
	int moved;
	assert_spin_locked(&wb->list_lock);
	list_splice_init(&wb->b_more_io, &wb->b_io);
	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
	trace_writeback_queue_io(wb, older_than_this, moved);
}

static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
		return inode->i_sb->s_op->write_inode(inode, wbc);
	return 0;
}

/*
 * Wait for writeback on an inode to complete.
 */
static void inode_wait_for_writeback(struct inode *inode,
				     struct bdi_writeback *wb)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	while (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		spin_unlock(&wb->list_lock);
		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&wb->list_lock);
		spin_lock(&inode->i_lock);
	}
}

/*
 * Write out an inode's dirty pages.  Called under wb->list_lock and
 * inode->i_lock.  Either the caller has an active reference on the inode or
 * the inode has I_WILL_FREE set.
 *
 * If `wait' is set, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile.  We want to avoid
 * starvation of particular inodes when others are being redirtied, prevent
 * livelocks, etc.
 */
static int
writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
		       struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	long nr_to_write = wbc->nr_to_write;
	unsigned dirty;
	int ret;

	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);

	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		/*
		 * If this inode is locked for writeback and we are not doing
		 * writeback-for-data-integrity, move it to b_more_io so that
		 * writeback can proceed with the other inodes on s_io.
		 *
		 * We'll have another go at writing back this inode when we
		 * have completed a full scan of b_io.
		 */
		if (wbc->sync_mode != WB_SYNC_ALL) {
			requeue_io(inode, wb);
			trace_writeback_single_inode_requeue(inode, wbc,
							     nr_to_write);
			return 0;
		}

		/*
		 * It's a data-integrity sync.  We must wait.
		 */
		inode_wait_for_writeback(inode, wb);
	}

	BUG_ON(inode->i_state & I_SYNC);

	/* Set I_SYNC, reset I_DIRTY_PAGES */
	inode->i_state |= I_SYNC;
	inode->i_state &= ~I_DIRTY_PAGES;
	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);

	ret = do_writepages(mapping, wbc);

	/*
	 * Make sure to wait on the data before writing out the metadata.
	 * This is important for filesystems that modify metadata on data
	 * I/O completion.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	/*
	 * Some filesystems may redirty the inode during the writeback
	 * due to delalloc, clear dirty metadata flags right before
	 * write_inode()
	 */
	spin_lock(&inode->i_lock);
	dirty = inode->i_state & I_DIRTY;
	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
	spin_unlock(&inode->i_lock);
	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		int err = write_inode(inode, wbc);
		if (ret == 0)
			ret = err;
	}

	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	inode->i_state &= ~I_SYNC;
	if (!(inode->i_state & I_FREEING)) {
		/*
		 * Sync livelock prevention. Each inode is tagged and synced in
		 * one shot. If still dirty, it will be redirty_tail()'ed below.
		 * Update the dirty time to prevent enqueue and sync it again.
		 */
		if ((inode->i_state & I_DIRTY) &&
		    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
			inode->dirtied_when = jiffies;

		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
			/*
			 * We didn't write back all the pages.  nfs_writepages()
			 * sometimes bails out without doing anything.
			 */
			inode->i_state |= I_DIRTY_PAGES;
			if (wbc->nr_to_write <= 0) {
				/*
				 * slice used up: queue for next turn
				 */
				requeue_io(inode, wb);
			} else {
				/*
				 * Writeback blocked by something other than
				 * congestion. Delay the inode for some time to
				 * avoid spinning on the CPU (100% iowait)
				 * retrying writeback of the dirty page/inode
				 * that cannot be performed immediately.
				 */
				redirty_tail(inode, wb);
			}
		} else if (inode->i_state & I_DIRTY) {
			/*
			 * Filesystems can dirty the inode during writeback
			 * operations, such as delayed allocation during
			 * submission or metadata updates after data IO
			 * completion.
			 */
			redirty_tail(inode, wb);
		} else {
			/*
			 * The inode is clean.  At this point we either have
			 * a reference to the inode or it's on its way out.
			 * No need to add it back to the LRU.
			 */
			list_del_init(&inode->i_wb_list);
		}
	}
	inode_sync_complete(inode);
	trace_writeback_single_inode(inode, wbc, nr_to_write);
	return ret;
}

/*
 * For background writeback the caller does not have the sb pinned
 * before calling writeback. So make sure that we do pin it, so it doesn't
 * go away while we are writing inodes from it.
 */
static bool pin_sb_for_writeback(struct super_block *sb)
{
	spin_lock(&sb_lock);
	if (list_empty(&sb->s_instances)) {
		spin_unlock(&sb_lock);
		return false;
	}

	sb->s_count++;
	spin_unlock(&sb_lock);

	if (down_read_trylock(&sb->s_umount)) {
		if (sb->s_root)
			return true;
		up_read(&sb->s_umount);
	}

	put_super(sb);
	return false;
}

static long writeback_chunk_size(struct wb_writeback_work *work)
{
	long pages;

	/*
	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
	 * here avoids calling into writeback_inodes_wb() more than once.
	 *
	 * The intended call sequence for WB_SYNC_ALL writeback is:
	 *
	 *      wb_writeback()
	 *          writeback_sb_inodes()       <== called only once
	 *              write_cache_pages()     <== called once for each inode
	 *                  (quickly) tag currently dirty pages
	 *                  (maybe slowly) sync all tagged pages
	 */
	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
		pages = LONG_MAX;
	else
		pages = min(MAX_WRITEBACK_PAGES, work->nr_pages);

	return pages;
}
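
/*
 * Illustrative example (not in the original file): for a plain WB_SYNC_NONE
 * work item with nr_pages == 300 the chunk above is min(1024L, 300) == 300;
 * with nr_pages == 50000 it is capped at MAX_WRITEBACK_PAGES (1024) so that
 * I_SYNC is not held against one inode for too long.  WB_SYNC_ALL and
 * tagged_writepages work always gets LONG_MAX, as explained above.
 */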

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * If @only_this_sb is true, then find and write all such
 * inodes. Otherwise write only ones which go sequentially
 * in reverse order.
 *
 * Return the number of pages and/or inodes written.
 */
static long writeback_sb_inodes(struct super_block *sb,
				struct bdi_writeback *wb,
				struct wb_writeback_work *work)
{
	struct writeback_control wbc = {
		.sync_mode		= work->sync_mode,
		.tagged_writepages	= work->tagged_writepages,
		.for_kupdate		= work->for_kupdate,
		.for_background		= work->for_background,
		.range_cyclic		= work->range_cyclic,
		.range_start		= 0,
		.range_end		= LLONG_MAX,
	};
	unsigned long start_time = jiffies;
	long write_chunk;
	long wrote = 0;  /* count both pages and inodes */

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);

		if (inode->i_sb != sb) {
			if (work->sb) {
				/*
				 * We only want to write back data for this
				 * superblock, move all inodes not belonging
				 * to it back onto the dirty list.
				 */
				redirty_tail(inode, wb);
				continue;
			}

			/*
			 * The inode belongs to a different superblock.
			 * Bounce back to the caller to unpin this and
			 * pin the next superblock.
			 */
			break;
		}

		/*
		 * Don't bother with new inodes or inodes being freed, first
		 * kind does not need periodic writeout yet, and for the latter
		 * kind writeout is handled by the freer.
		 */
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			requeue_io(inode, wb);
			continue;
		}
		__iget(inode);
		write_chunk = writeback_chunk_size(work);
		wbc.nr_to_write = write_chunk;
		wbc.pages_skipped = 0;

		writeback_single_inode(inode, wb, &wbc);

		work->nr_pages -= write_chunk - wbc.nr_to_write;
		wrote += write_chunk - wbc.nr_to_write;
		if (!(inode->i_state & I_DIRTY))
			wrote++;
		if (wbc.pages_skipped) {
			/*
			 * writeback is not making progress due to locked
			 * buffers.  Skip this inode for now.
			 */
			redirty_tail(inode, wb);
		}
		spin_unlock(&inode->i_lock);
		spin_unlock(&wb->list_lock);
		iput(inode);
		cond_resched();
		spin_lock(&wb->list_lock);
		/*
		 * bail out to wb_writeback() often enough to check
		 * background threshold and other termination conditions.
		 */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	return wrote;
}

static long __writeback_inodes_wb(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	unsigned long start_time = jiffies;
	long wrote = 0;

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);
		struct super_block *sb = inode->i_sb;

		if (!pin_sb_for_writeback(sb)) {
			requeue_io(inode, wb);
			continue;
		}
		wrote += writeback_sb_inodes(sb, wb, work);
		drop_super(sb);

		/* refer to the same tests at the end of writeback_sb_inodes */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	/* Leave any unwritten inodes on b_io */
	return wrote;
}

long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
{
	struct wb_writeback_work work = {
		.nr_pages	= nr_pages,
		.sync_mode	= WB_SYNC_NONE,
		.range_cyclic	= 1,
	};

	spin_lock(&wb->list_lock);
	if (list_empty(&wb->b_io))
		queue_io(wb, NULL);
	__writeback_inodes_wb(wb, &work);
	spin_unlock(&wb->list_lock);

	return nr_pages - work.nr_pages;
}

static inline bool over_bground_thresh(void)
{
	unsigned long background_thresh, dirty_thresh;

	global_dirty_limits(&background_thresh, &dirty_thresh);

	return (global_page_state(NR_FILE_DIRTY) +
		global_page_state(NR_UNSTABLE_NFS) > background_thresh);
}
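
/*
 * Illustrative note (not in the original file): global_dirty_limits() derives
 * background_thresh from the vm.dirty_background_ratio/_bytes sysctls, so with
 * the common 10% default, background writeback keeps going for as long as
 * more than roughly 10% of dirtyable memory is dirty or unstable NFS pages.
 */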

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_work *work)
{
	long nr_pages = work->nr_pages;
	unsigned long oldest_jif;
	struct inode *inode;
	long progress;

	oldest_jif = jiffies;
	work->older_than_this = &oldest_jif;

	spin_lock(&wb->list_lock);
	for (;;) {
		/*
		 * Stop writeback when nr_pages has been consumed
		 */
		if (work->nr_pages <= 0)
			break;

		/*
		 * Background writeout and kupdate-style writeback may
		 * run forever. Stop them if there is other work to do
		 * so that e.g. sync can proceed. They'll be restarted
		 * after the other works are all done.
		 */
		if ((work->for_background || work->for_kupdate) &&
		    !list_empty(&wb->bdi->work_list))
			break;

		/*
		 * For background writeout, stop when we are below the
		 * background dirty threshold
		 */
		if (work->for_background && !over_bground_thresh())
			break;

		if (work->for_kupdate) {
			oldest_jif = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
			work->older_than_this = &oldest_jif;
		}

		trace_writeback_start(wb->bdi, work);
		if (list_empty(&wb->b_io))
			queue_io(wb, work->older_than_this);
		if (work->sb)
			progress = writeback_sb_inodes(work->sb, wb, work);
		else
			progress = __writeback_inodes_wb(wb, work);
		trace_writeback_written(wb->bdi, work);

		/*
		 * Did we write something? Try for more
		 *
		 * Dirty inodes are moved to b_io for writeback in batches.
		 * The completion of the current batch does not necessarily
		 * mean the overall work is done. So we keep looping as long
		 * as we make some progress on cleaning pages or inodes.
		 */
		if (progress)
			continue;
		/*
		 * No more inodes for IO, bail
		 */
		if (list_empty(&wb->b_more_io))
			break;
		/*
		 * Nothing written. Wait for some inode to
		 * become available for writeback. Otherwise
		 * we'll just busyloop.
		 */
		if (!list_empty(&wb->b_more_io)) {
			trace_writeback_wait(wb->bdi, work);
			inode = wb_inode(wb->b_more_io.prev);
			spin_lock(&inode->i_lock);
			inode_wait_for_writeback(inode, wb);
			spin_unlock(&inode->i_lock);
		}
	}
	spin_unlock(&wb->list_lock);

	return nr_pages - work->nr_pages;
}

/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
static struct wb_writeback_work *
get_next_work_item(struct backing_dev_info *bdi)
{
	struct wb_writeback_work *work = NULL;

	spin_lock_bh(&bdi->wb_lock);
	if (!list_empty(&bdi->work_list)) {
		work = list_entry(bdi->work_list.next,
				  struct wb_writeback_work, list);
		list_del_init(&work->list);
	}
	spin_unlock_bh(&bdi->wb_lock);
	return work;
}

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
	return global_page_state(NR_FILE_DIRTY) +
		global_page_state(NR_UNSTABLE_NFS) +
		get_nr_dirty_inodes();
}

static long wb_check_background_flush(struct bdi_writeback *wb)
{
	if (over_bground_thresh()) {

		struct wb_writeback_work work = {
			.nr_pages	= LONG_MAX,
			.sync_mode	= WB_SYNC_NONE,
			.for_background	= 1,
			.range_cyclic	= 1,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	/*
	 * When set to zero, disable periodic writeback
	 */
	if (!dirty_writeback_interval)
		return 0;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = get_nr_dirty_pages();

	if (nr_pages) {
		struct wb_writeback_work work = {
			.nr_pages	= nr_pages,
			.sync_mode	= WB_SYNC_NONE,
			.for_kupdate	= 1,
			.range_cyclic	= 1,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct wb_writeback_work *work;
	long wrote = 0;

	set_bit(BDI_writeback_running, &wb->bdi->state);
	while ((work = get_next_work_item(bdi)) != NULL) {
		/*
		 * Override sync mode, in case we must wait for completion
		 * because this thread is exiting now.
		 */
		if (force_wait)
			work->sync_mode = WB_SYNC_ALL;

		trace_writeback_exec(bdi, work);

		wrote += wb_writeback(wb, work);

		/*
		 * Notify the caller of completion if this is a synchronous
		 * work item, otherwise just free it.
		 */
		if (work->done)
			complete(work->done);
		else
			kfree(work);
	}

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);
	wrote += wb_check_background_flush(wb);
	clear_bit(BDI_writeback_running, &wb->bdi->state);

	return wrote;
}
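
/*
 * Illustrative note (not in the original file): a work item reaches the loop
 * above in one of two ways.  sync_inodes_sb() and writeback_inodes_sb_nr()
 * below queue a wb_writeback_work that lives on the caller's stack with
 * ->done pointing at a completion, so wb_do_writeback() signals it with
 * complete() rather than kfree()ing it; __bdi_start_writeback() instead
 * kzalloc()s the work with ->done == NULL and lets it be freed here.
 */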

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_thread(void *data)
{
	struct bdi_writeback *wb = data;
	struct backing_dev_info *bdi = wb->bdi;
	long pages_written;

	current->flags |= PF_SWAPWRITE;
	set_freezable();
	wb->last_active = jiffies;

	/*
	 * Our parent may run at a different priority, just set us to normal
	 */
	set_user_nice(current, 0);

	trace_writeback_thread_start(bdi);

	while (!kthread_should_stop()) {
		/*
		 * Remove own delayed wake-up timer, since we are already awake
		 * and we'll take care of the periodic write-back.
		 */
		del_timer(&wb->wakeup_timer);

		pages_written = wb_do_writeback(wb, 0);

		trace_writeback_pages_written(pages_written);

		if (pages_written)
			wb->last_active = jiffies;

		set_current_state(TASK_INTERRUPTIBLE);
		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			continue;
		}

		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
		else {
			/*
			 * We have nothing to do, so can go sleep without any
			 * timeout and save power. When a work is queued or
			 * something is made dirty - we will be woken up.
			 */
			schedule();
		}

		try_to_freeze();
	}

	/* Flush any work that raced with us exiting */
	if (!list_empty(&bdi->work_list))
		wb_do_writeback(wb, 1);

	trace_writeback_thread_stop(bdi);
	return 0;
}


/*
 * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages)
{
	struct backing_dev_info *bdi;

	if (!nr_pages) {
		nr_pages = global_page_state(NR_FILE_DIRTY) +
				global_page_state(NR_UNSTABLE_NFS);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
		if (!bdi_has_dirty_io(bdi))
			continue;
		__bdi_start_writeback(bdi, nr_pages, false);
	}
	rcu_read_unlock();
}

static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
		struct dentry *dentry;
		const char *name = "?";

		dentry = d_find_alias(inode);
		if (dentry) {
			spin_lock(&dentry->d_lock);
			name = (const char *) dentry->d_name.name;
		}
		printk(KERN_DEBUG
		       "%s(%d): dirtied inode %lu (%s) on %s\n",
		       current->comm, task_pid_nr(current), inode->i_ino,
		       name, inode->i_sb->s_id);
		if (dentry) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}
	}
}
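
/*
 * Illustrative note (not in the original file): the debug printk above is
 * gated by the block_dump sysctl, which can be toggled at run time with
 * something like "echo 1 > /proc/sys/vm/block_dump".
 */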

/**
 * __mark_inode_dirty - internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;
	struct backing_dev_info *bdi = NULL;

	/*
	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
	 * dirty the inode itself
	 */
	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode, flags);
	}

	/*
	 * make sure that changes are seen by all cpus before we test i_state
	 * -- mikulas
	 */
	smp_mb();

	/* avoid the locking if we can */
	if ((inode->i_state & flags) == flags)
		return;

	if (unlikely(block_dump))
		block_dump___mark_inode_dirty(inode);

	spin_lock(&inode->i_lock);
	if ((inode->i_state & flags) != flags) {
		const int was_dirty = inode->i_state & I_DIRTY;

		inode->i_state |= flags;

		/*
		 * If the inode is being synced, just update its dirty state.
		 * The unlocker will place the inode on the appropriate
		 * superblock list, based upon its state.
		 */
		if (inode->i_state & I_SYNC)
			goto out_unlock_inode;

		/*
		 * Only add valid (hashed) inodes to the superblock's
		 * dirty list.  Add blockdev inodes as well.
		 */
		if (!S_ISBLK(inode->i_mode)) {
			if (inode_unhashed(inode))
				goto out_unlock_inode;
		}
		if (inode->i_state & I_FREEING)
			goto out_unlock_inode;

		/*
		 * If the inode was already on b_dirty/b_io/b_more_io, don't
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
			bool wakeup_bdi = false;
			bdi = inode_to_bdi(inode);

			if (bdi_cap_writeback_dirty(bdi)) {
				WARN(!test_bit(BDI_registered, &bdi->state),
				     "bdi-%s not registered\n", bdi->name);

				/*
				 * If this is the first dirty inode for this
				 * bdi, we have to wake-up the corresponding
				 * bdi thread to make sure background
				 * write-back happens later.
				 */
				if (!wb_has_dirty_io(&bdi->wb))
					wakeup_bdi = true;
			}

			spin_unlock(&inode->i_lock);
			spin_lock(&bdi->wb.list_lock);
			inode->dirtied_when = jiffies;
			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
			spin_unlock(&bdi->wb.list_lock);

			if (wakeup_bdi)
				bdi_wakeup_thread_delayed(bdi);
			return;
		}
	}
out_unlock_inode:
	spin_unlock(&inode->i_lock);

}
EXPORT_SYMBOL(__mark_inode_dirty);

/*
 * Write out a superblock's list of dirty inodes.  A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
 * If older_than_this is non-NULL, then only write out inodes which
 * had their first dirtying at a time earlier than *older_than_this.
 *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched.  For other superblocks,
 * assume that all inodes are backed by the same queue.
 *
 * The inodes to be written are parked on bdi->b_io.  They are moved back onto
 * bdi->b_dirty as they are selected for writing.  This way, none can be missed
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
static void wait_sb_inodes(struct super_block *sb)
{
	struct inode *inode, *old_inode = NULL;

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	spin_lock(&inode_sb_list_lock);

	/*
	 * Data integrity sync. Must wait for all pages under writeback,
	 * because there may have been pages dirtied before our sync
	 * call, but which had writeout started before we write it out.
	 * In which case, the inode may not be on the dirty list, but
	 * we still have to wait for that writeout.
	 */
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;

		spin_lock(&inode->i_lock);
		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
		    (mapping->nrpages == 0)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_sb_list_lock);

		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
		 * inode_sb_list_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * inode_sb_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;

		filemap_fdatawait(mapping);

		cond_resched();

		spin_lock(&inode_sb_list_lock);
	}
	spin_unlock(&inode_sb_list_lock);
	iput(old_inode);
}

/**
 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb			= sb,
		.sync_mode		= WB_SYNC_NONE,
		.tagged_writepages	= 1,
		.done			= &done,
		.nr_pages		= nr,
	};

	WARN_ON(!rwsem_is_locked(&sb->s_umount));
	bdi_queue_work(sb->s_bdi, &work);
	wait_for_completion(&done);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);

/**
 * writeback_inodes_sb - writeback dirty inodes from given super_block
 * @sb: the superblock
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb(struct super_block *sb)
{
	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
}
EXPORT_SYMBOL(writeback_inodes_sb);
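
/*
 * Illustrative sketch (not in the original file): a filesystem that wants to
 * push out roughly nr dirty pages for its superblock without blocking on the
 * IO, and only if the flusher is not already busy, would do something like
 *
 *	if (!writeback_in_progress(sb->s_bdi)) {
 *		down_read(&sb->s_umount);
 *		writeback_inodes_sb_nr(sb, nr);
 *		up_read(&sb->s_umount);
 *	}
 *
 * which is exactly what writeback_inodes_sb_nr_if_idle() below wraps up.
 */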

/**
 * writeback_inodes_sb_if_idle - start writeback if none underway
 * @sb: the superblock
 *
 * Invoke writeback_inodes_sb if no writeback is currently underway.
 * Returns 1 if writeback was started, 0 if not.
 */
int writeback_inodes_sb_if_idle(struct super_block *sb)
{
	if (!writeback_in_progress(sb->s_bdi)) {
		down_read(&sb->s_umount);
		writeback_inodes_sb(sb);
		up_read(&sb->s_umount);
		return 1;
	} else
		return 0;
}
EXPORT_SYMBOL(writeback_inodes_sb_if_idle);

/**
 * writeback_inodes_sb_nr_if_idle - start writeback if none underway
 * @sb: the superblock
 * @nr: the number of pages to write
 *
 * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
 * Returns 1 if writeback was started, 0 if not.
 */
int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
				   unsigned long nr)
{
	if (!writeback_in_progress(sb->s_bdi)) {
		down_read(&sb->s_umount);
		writeback_inodes_sb_nr(sb, nr);
		up_read(&sb->s_umount);
		return 1;
	} else
		return 0;
}
EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);

/**
 * sync_inodes_sb - sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.nr_pages	= LONG_MAX,
		.range_cyclic	= 0,
		.done		= &done,
	};

	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	bdi_queue_work(sb->s_bdi, &work);
	wait_for_completion(&done);

	wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
	int ret;
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};

	if (!mapping_cap_writeback_dirty(inode->i_mapping))
		wbc.nr_to_write = 0;

	might_sleep();
	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	ret = writeback_single_inode(inode, wb, &wbc);
	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);
	if (sync)
		inode_sync_wait(inode);
	return ret;
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk.  It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
	int ret;

	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	ret = writeback_single_inode(inode, wb, wbc);
	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);
	return ret;
}
EXPORT_SYMBOL(sync_inode);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
int sync_inode_metadata(struct inode *inode, int wait)
{
	struct writeback_control wbc = {
		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
		.nr_to_write = 0, /* metadata-only */
	};

	return sync_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);
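
/*
 * Illustrative sketch (not in the original file): a filesystem ->fsync()
 * implementation that has already written and waited on the data pages
 * might finish by pushing the inode itself out synchronously:
 *
 *	err = sync_inode_metadata(inode, 1);
 *
 * With wait == 0 the same call only starts the inode write without waiting
 * for it to complete.
 */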