/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include "internal.h"

#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)

/*
 * We don't actually have pdflush, but this one is exported through /proc...
 */
int nr_pdflush_threads;

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_args {
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	int for_kupdate;
	int range_cyclic;
};

/*
 * Work items for the bdi_writeback threads
 */
struct bdi_work {
	struct list_head list;		/* pending work list */
	struct rcu_head rcu_head;	/* for RCU free/clear of work */

	unsigned long seen;		/* threads that have seen this work */
	atomic_t pending;		/* number of threads still to do work */

	struct wb_writeback_args args;	/* writeback arguments */

	unsigned long state;		/* flag bits, see WS_* */
};

enum {
	WS_USED_B = 0,
	WS_ONSTACK_B,
};

#define WS_USED (1 << WS_USED_B)
#define WS_ONSTACK (1 << WS_ONSTACK_B)

static inline bool bdi_work_on_stack(struct bdi_work *work)
{
	return test_bit(WS_ONSTACK_B, &work->state);
}

static inline void bdi_work_init(struct bdi_work *work,
				 struct wb_writeback_args *args)
{
	INIT_RCU_HEAD(&work->rcu_head);
	work->args = *args;
	work->state = WS_USED;
}

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback waiting to be handled against a
 * backing device.
 */
int writeback_in_progress(struct backing_dev_info *bdi)
{
	return !list_empty(&bdi->work_list);
}

static void bdi_work_clear(struct bdi_work *work)
{
	clear_bit(WS_USED_B, &work->state);
	smp_mb__after_clear_bit();
	/*
	 * work can have disappeared at this point. bit waitq functions
	 * should be able to tolerate this, provided bdi_sched_wait does
	 * not dereference its pointer argument.
	 */
	wake_up_bit(&work->state, WS_USED_B);
}

static void bdi_work_free(struct rcu_head *head)
{
	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);

	if (!bdi_work_on_stack(work))
		kfree(work);
	else
		bdi_work_clear(work);
}
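
/*
 * Note on the WS_USED handshake: for on-stack work items the submitter
 * sleeps in bdi_wait_on_work_clear() until WS_USED_B is cleared by
 * bdi_work_clear(), which also wakes the bit waitqueue.  For kmalloc'd
 * work items the struct is simply kfree'd from the RCU callback above.
 */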
static void wb_work_complete(struct bdi_work *work)
{
	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
	int onstack = bdi_work_on_stack(work);

	/*
	 * For allocated work, we can clear the done/seen bit right here.
	 * For on-stack work, we need to postpone both the clear and free
	 * to after the RCU grace period, since the stack could be invalidated
	 * as soon as bdi_work_clear() has done the wakeup.
	 */
	if (!onstack)
		bdi_work_clear(work);
	if (sync_mode == WB_SYNC_NONE || onstack)
		call_rcu(&work->rcu_head, bdi_work_free);
}

static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
{
	/*
	 * The caller has retrieved the work arguments from this work,
	 * drop our reference. If this is the last ref, delete and free it
	 */
	if (atomic_dec_and_test(&work->pending)) {
		struct backing_dev_info *bdi = wb->bdi;

		spin_lock(&bdi->wb_lock);
		list_del_rcu(&work->list);
		spin_unlock(&bdi->wb_lock);

		wb_work_complete(work);
	}
}

static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
{
	work->seen = bdi->wb_mask;
	BUG_ON(!work->seen);
	atomic_set(&work->pending, bdi->wb_cnt);
	BUG_ON(!bdi->wb_cnt);

	/*
	 * list_add_tail_rcu() contains the necessary barriers to
	 * make sure the above stores are seen before the item is
	 * noticed on the list
	 */
	spin_lock(&bdi->wb_lock);
	list_add_tail_rcu(&work->list, &bdi->work_list);
	spin_unlock(&bdi->wb_lock);

	/*
	 * If the default thread isn't there, make sure we add it. When
	 * it gets created and wakes up, we'll run this work.
	 */
	if (unlikely(list_empty_careful(&bdi->wb_list)))
		wake_up_process(default_backing_dev_info.wb.task);
	else {
		struct bdi_writeback *wb = &bdi->wb;

		if (wb->task)
			wake_up_process(wb->task);
	}
}

/*
 * Used for on-stack allocated work items. The caller needs to wait until
 * the wb threads have acked the work before it's safe to continue.
 */
static void bdi_wait_on_work_clear(struct bdi_work *work)
{
	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
		    TASK_UNINTERRUPTIBLE);
}

static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
				 struct wb_writeback_args *args)
{
	struct bdi_work *work;

	/*
	 * This is WB_SYNC_NONE writeback, so if allocation fails just
	 * wakeup the thread for old dirty data writeback
	 */
	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work) {
		bdi_work_init(work, args);
		bdi_queue_work(bdi, work);
	} else {
		struct bdi_writeback *wb = &bdi->wb;

		if (wb->task)
			wake_up_process(wb->task);
	}
}
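
/*
 * The two submission paths, at a glance (illustrative summary):
 *
 *   WB_SYNC_NONE:  bdi_alloc_queue_work() kmallocs the bdi_work and
 *                  returns immediately; the work is freed via RCU once
 *                  the flusher thread has picked up the arguments.
 *
 *   WB_SYNC_ALL:   bdi_sync_writeback() (below) queues an on-stack
 *                  bdi_work and blocks in bdi_wait_on_work_clear()
 *                  until the flusher thread has completed it.
 */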
/**
 * bdi_sync_writeback - start and wait for writeback
 * @bdi: the backing device to write from
 * @sb: write inodes from this super_block
 *
 * Description:
 *   This does WB_SYNC_ALL data integrity writeback and waits for the
 *   IO to complete. Callers must hold the sb s_umount semaphore for
 *   reading, to avoid having the super disappear before we are done.
 */
static void bdi_sync_writeback(struct backing_dev_info *bdi,
			       struct super_block *sb)
{
	struct wb_writeback_args args = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.nr_pages	= LONG_MAX,
		.range_cyclic	= 0,
	};
	struct bdi_work work;

	bdi_work_init(&work, &args);
	work.state |= WS_ONSTACK;

	bdi_queue_work(bdi, &work);
	bdi_wait_on_work_clear(&work);
}

/**
 * bdi_start_writeback - start writeback
 * @bdi: the backing device to write from
 * @nr_pages: the number of pages to write
 *
 * Description:
 *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
 *   started when this function returns, we make no guarantees on
 *   completion. Caller need not hold sb s_umount semaphore.
 *
 */
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
{
	struct wb_writeback_args args = {
		.sync_mode	= WB_SYNC_NONE,
		.nr_pages	= nr_pages,
		.range_cyclic	= 1,
	};

	bdi_alloc_queue_work(bdi, &args);
}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	list_move(&inode->i_list, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	list_move(&inode->i_list, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
	/*
	 * Prevent speculative execution through spin_unlock(&inode_lock);
	 */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in distant past.
	 * This test is necessary to prevent such wrapped-around relative times
	 * from permanently stopping the whole pdflush writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}

/*
 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
 */
static void move_expired_inodes(struct list_head *delaying_queue,
				struct list_head *dispatch_queue,
				unsigned long *older_than_this)
{
	while (!list_empty(delaying_queue)) {
		struct inode *inode = list_entry(delaying_queue->prev,
						 struct inode, i_list);
		if (older_than_this &&
		    inode_dirtied_after(inode, *older_than_this))
			break;
		list_move(&inode->i_list, dispatch_queue);
	}
}
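
/*
 * b_dirty is kept in reverse time order: __mark_inode_dirty() adds newly
 * dirtied inodes at the head, so the oldest entries sit at the tail.
 * move_expired_inodes() therefore scans from ->prev and stops at the
 * first inode that is still younger than *older_than_this.
 */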
/*
 * Queue all expired dirty inodes for io, eldest first.
 */
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
	list_splice_init(&wb->b_more_io, wb->b_io.prev);
	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}

static int write_inode(struct inode *inode, int sync)
{
	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
		return inode->i_sb->s_op->write_inode(inode, sync);
	return 0;
}

/*
 * Wait for writeback on an inode to complete.
 */
static void inode_wait_for_writeback(struct inode *inode)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	do {
		spin_unlock(&inode_lock);
		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&inode_lock);
	} while (inode->i_state & I_SYNC);
}
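
/*
 * I_SYNC serializes writeback of a single inode: writeback_single_inode()
 * sets it under inode_lock before writing and clears it when done, and
 * inode_wait_for_writeback()/inode_sync_complete() implement the
 * sleep/wakeup on that bit for data-integrity callers.
 */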
/*
 * Write out an inode's dirty pages.  Called under inode_lock.  Either the
 * caller has ref on the inode (either via __iget or via syscall against an fd)
 * or the inode has I_WILL_FREE set (via generic_forget_inode)
 *
 * If `wait' is set, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile.  We want to avoid
 * starvation of particular inodes when others are being redirtied, prevent
 * livelocks, etc.
 *
 * Called under inode_lock.
 */
static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	int wait = wbc->sync_mode == WB_SYNC_ALL;
	unsigned dirty;
	int ret;

	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		/*
		 * If this inode is locked for writeback and we are not doing
		 * writeback-for-data-integrity, move it to b_more_io so that
		 * writeback can proceed with the other inodes on s_io.
		 *
		 * We'll have another go at writing back this inode when we
		 * completed a full scan of b_io.
		 */
		if (!wait) {
			requeue_io(inode);
			return 0;
		}

		/*
		 * It's a data-integrity sync.  We must wait.
		 */
		inode_wait_for_writeback(inode);
	}

	BUG_ON(inode->i_state & I_SYNC);

	/* Set I_SYNC, reset I_DIRTY */
	dirty = inode->i_state & I_DIRTY;
	inode->i_state |= I_SYNC;
	inode->i_state &= ~I_DIRTY;

	spin_unlock(&inode_lock);

	ret = do_writepages(mapping, wbc);

	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		int err = write_inode(inode, wait);
		if (ret == 0)
			ret = err;
	}

	if (wait) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	spin_lock(&inode_lock);
	inode->i_state &= ~I_SYNC;
	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
		if (!(inode->i_state & I_DIRTY) &&
		    mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
			/*
			 * We didn't write back all the pages.  nfs_writepages()
			 * sometimes bails out without doing anything.  Redirty
			 * the inode; Move it from b_io onto b_more_io/b_dirty.
			 */
			/*
			 * akpm: if the caller was the kupdate function we put
			 * this inode at the head of b_dirty so it gets first
			 * consideration.  Otherwise, move it to the tail, for
			 * the reasons described there.  I'm not really sure
			 * how much sense this makes.  Presumably I had good
			 * reasons for doing it this way, and I'd rather not
			 * muck with it at present.
			 */
			if (wbc->for_kupdate) {
				/*
				 * For the kupdate function we move the inode
				 * to b_more_io so it will get more writeout as
				 * soon as the queue becomes uncongested.
				 */
				inode->i_state |= I_DIRTY_PAGES;
				if (wbc->nr_to_write <= 0) {
					/*
					 * slice used up: queue for next turn
					 */
					requeue_io(inode);
				} else {
					/*
					 * somehow blocked: retry later
					 */
					redirty_tail(inode);
				}
			} else {
				/*
				 * Otherwise fully redirty the inode so that
				 * other inodes on this superblock will get some
				 * writeout.  Otherwise heavy writing to one
				 * file would indefinitely suspend writeout of
				 * all the other files.
				 */
				inode->i_state |= I_DIRTY_PAGES;
				redirty_tail(inode);
			}
		} else if (inode->i_state & I_DIRTY) {
			/*
			 * Someone redirtied the inode while we were writing
			 * back the pages.
			 */
			redirty_tail(inode);
		} else if (atomic_read(&inode->i_count)) {
			/*
			 * The inode is clean, inuse
			 */
			list_move(&inode->i_list, &inode_in_use);
		} else {
			/*
			 * The inode is clean, unused
			 */
			list_move(&inode->i_list, &inode_unused);
		}
	}
	inode_sync_complete(inode);
	return ret;
}

/*
 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
 * before calling writeback. So make sure that we do pin it, so it doesn't
 * go away while we are writing inodes from it.
 *
 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
 * 1 if we failed.
 */
static int pin_sb_for_writeback(struct writeback_control *wbc,
				struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Caller must already hold the ref for this
	 */
	if (wbc->sync_mode == WB_SYNC_ALL) {
		WARN_ON(!rwsem_is_locked(&sb->s_umount));
		return 0;
	}

	spin_lock(&sb_lock);
	sb->s_count++;
	if (down_read_trylock(&sb->s_umount)) {
		if (sb->s_root) {
			spin_unlock(&sb_lock);
			return 0;
		}
		/*
		 * umounted, drop rwsem again and fall through to failure
		 */
		up_read(&sb->s_umount);
	}

	sb->s_count--;
	spin_unlock(&sb_lock);
	return 1;
}

static void unpin_sb_for_writeback(struct writeback_control *wbc,
				   struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (wbc->sync_mode == WB_SYNC_ALL)
		return;

	up_read(&sb->s_umount);
	put_super(sb);
}
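
/*
 * pin_sb_for_writeback()/unpin_sb_for_writeback() bracket each inode we
 * write in writeback_inodes_wb() below, so a WB_SYNC_NONE pass cannot race
 * with umount of the superblock it is touching.  WB_SYNC_ALL callers are
 * required to hold s_umount already, so for them both helpers return
 * without touching the superblock reference counts.
 */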
static void writeback_inodes_wb(struct bdi_writeback *wb,
				struct writeback_control *wbc)
{
	struct super_block *sb = wbc->sb;
	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
	const unsigned long start = jiffies;	/* livelock avoidance */

	spin_lock(&inode_lock);

	if (!wbc->for_kupdate || list_empty(&wb->b_io))
		queue_io(wb, wbc->older_than_this);

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = list_entry(wb->b_io.prev,
						 struct inode, i_list);
		long pages_skipped;

		/*
		 * super block given and doesn't match, skip this inode
		 */
		if (sb && sb != inode->i_sb) {
			redirty_tail(inode);
			continue;
		}

		if (!bdi_cap_writeback_dirty(wb->bdi)) {
			redirty_tail(inode);
			if (is_blkdev_sb) {
				/*
				 * Dirty memory-backed blockdev: the ramdisk
				 * driver does this.  Skip just this inode
				 */
				continue;
			}
			/*
			 * Dirty memory-backed inode against a filesystem other
			 * than the kernel-internal bdev filesystem.  Skip the
			 * entire superblock.
			 */
			break;
		}

		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
			requeue_io(inode);
			continue;
		}

		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
			wbc->encountered_congestion = 1;
			if (!is_blkdev_sb)
				break;		/* Skip a congested fs */
			requeue_io(inode);
			continue;		/* Skip a congested blockdev */
		}

		/*
		 * Was this inode dirtied after sync_sb_inodes was called?
		 * This keeps sync from creating extra work for itself and
		 * from livelocking.
		 */
		if (inode_dirtied_after(inode, start))
			break;

		if (pin_sb_for_writeback(wbc, inode)) {
			requeue_io(inode);
			continue;
		}

		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
		__iget(inode);
		pages_skipped = wbc->pages_skipped;
		writeback_single_inode(inode, wbc);
		unpin_sb_for_writeback(wbc, inode);
		if (wbc->pages_skipped != pages_skipped) {
			/*
			 * writeback is not making progress due to locked
			 * buffers.  Skip this inode for now.
			 */
			redirty_tail(inode);
		}
		spin_unlock(&inode_lock);
		iput(inode);
		cond_resched();
		spin_lock(&inode_lock);
		if (wbc->nr_to_write <= 0) {
			wbc->more_io = 1;
			break;
		}
		if (!list_empty(&wb->b_more_io))
			wbc->more_io = 1;
	}

	spin_unlock(&inode_lock);
	/* Leave any unwritten inodes on b_io */
}

void writeback_inodes_wbc(struct writeback_control *wbc)
{
	struct backing_dev_info *bdi = wbc->bdi;

	writeback_inodes_wb(&bdi->wb, wbc);
}

/*
 * The maximum number of pages to writeout in a single bdi flush/kupdate
 * operation.  We do this so we don't hold I_SYNC against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode.  Also, the code reevaluates
 * its dirty thresholds each time it has written this many pages.
 */
#define MAX_WRITEBACK_PAGES	1024

static inline bool over_bground_thresh(void)
{
	unsigned long background_thresh, dirty_thresh;

	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);

	return (global_page_state(NR_FILE_DIRTY) +
		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
}
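
/*
 * With the common 4K page size, MAX_WRITEBACK_PAGES corresponds to roughly
 * 4MB per pass.  wb_writeback() below works in chunks of this size and
 * re-evaluates its exit conditions (remaining nr_pages, and for periodic
 * writeback the background threshold above) between chunks.
 */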
/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_args *args)
{
	struct writeback_control wbc = {
		.bdi			= wb->bdi,
		.sb			= args->sb,
		.sync_mode		= args->sync_mode,
		.older_than_this	= NULL,
		.for_kupdate		= args->for_kupdate,
		.range_cyclic		= args->range_cyclic,
	};
	unsigned long oldest_jif;
	long wrote = 0;

	if (wbc.for_kupdate) {
		wbc.older_than_this = &oldest_jif;
		oldest_jif = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
	}
	if (!wbc.range_cyclic) {
		wbc.range_start = 0;
		wbc.range_end = LLONG_MAX;
	}

	for (;;) {
		/*
		 * Don't flush anything for non-integrity writeback where
		 * no nr_pages was given
		 */
		if (!args->for_kupdate && args->nr_pages <= 0 &&
		    args->sync_mode == WB_SYNC_NONE)
			break;

		/*
		 * If no specific pages were given and this is just a
		 * periodic background writeout and we are below the
		 * background dirty threshold, don't do anything
		 */
		if (args->for_kupdate && args->nr_pages <= 0 &&
		    !over_bground_thresh())
			break;

		wbc.more_io = 0;
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		wbc.pages_skipped = 0;
		writeback_inodes_wb(wb, &wbc);
		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;

		/*
		 * If we ran out of stuff to write, bail unless more_io got set
		 */
		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
			if (wbc.more_io && !wbc.for_kupdate)
				continue;
			break;
		}
	}

	return wrote;
}
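
/*
 * Each pass through the loop above accounts MAX_WRITEBACK_PAGES -
 * wbc.nr_to_write pages as written.  A pass that leaves nr_to_write > 0
 * (or that skipped pages) means we ran out of dirty inodes on b_io; we
 * then retry only if more_io was set and this is not kupdate-style
 * writeback, otherwise we stop.
 */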
/*
 * Return the next bdi_work struct that hasn't been processed by this
 * wb thread yet. ->seen is initially set for each thread that exists
 * for this device, when a thread first notices a piece of work it
 * clears its bit. Depending on writeback type, the thread will notify
 * completion on either receiving the work (WB_SYNC_NONE) or after
 * it is done (WB_SYNC_ALL).
 */
static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
					   struct bdi_writeback *wb)
{
	struct bdi_work *work, *ret = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(work, &bdi->work_list, list) {
		if (!test_bit(wb->nr, &work->seen))
			continue;
		clear_bit(wb->nr, &work->seen);

		ret = work;
		break;
	}

	rcu_read_unlock();
	return ret;
}

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = global_page_state(NR_FILE_DIRTY) +
			global_page_state(NR_UNSTABLE_NFS) +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);

	if (nr_pages) {
		struct wb_writeback_args args = {
			.nr_pages	= nr_pages,
			.sync_mode	= WB_SYNC_NONE,
			.for_kupdate	= 1,
			.range_cyclic	= 1,
		};

		return wb_writeback(wb, &args);
	}

	return 0;
}

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct bdi_work *work;
	long wrote = 0;

	while ((work = get_next_work_item(bdi, wb)) != NULL) {
		struct wb_writeback_args args = work->args;

		/*
		 * Override sync mode, in case we must wait for completion
		 */
		if (force_wait)
			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;

		/*
		 * If this isn't a data integrity operation, just notify
		 * that we have seen this work and we are now starting it.
		 */
		if (args.sync_mode == WB_SYNC_NONE)
			wb_clear_pending(wb, work);

		wrote += wb_writeback(wb, &args);

		/*
		 * This is a data integrity writeback, so only do the
		 * notification when we have completed the work.
		 */
		if (args.sync_mode == WB_SYNC_ALL)
			wb_clear_pending(wb, work);
	}

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);

	return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_task(struct bdi_writeback *wb)
{
	unsigned long last_active = jiffies;
	unsigned long wait_jiffies = -1UL;
	long pages_written;

	while (!kthread_should_stop()) {
		pages_written = wb_do_writeback(wb, 0);

		if (pages_written)
			last_active = jiffies;
		else if (wait_jiffies != -1UL) {
			unsigned long max_idle;

			/*
			 * Longest period of inactivity that we tolerate. If we
			 * see dirty data again later, the task will get
			 * recreated automatically.
			 */
			max_idle = max(5UL * 60 * HZ, wait_jiffies);
			if (time_after(jiffies, max_idle + last_active))
				break;
		}

		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
		schedule_timeout_interruptible(wait_jiffies);
		try_to_freeze();
	}

	return 0;
}
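
/*
 * Note that the per-bdi flusher exits after roughly five minutes without
 * writing anything (max(5 * 60 * HZ, wait_jiffies) of idle time above).
 * As the comment in bdi_writeback_task() says, the task is re-created on
 * demand when new dirty data shows up, so exiting here is harmless.
 */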
/*
 * Schedule writeback for all backing devices. This does WB_SYNC_NONE
 * writeback, for integrity writeback see bdi_sync_writeback().
 */
static void bdi_writeback_all(struct super_block *sb, long nr_pages)
{
	struct wb_writeback_args args = {
		.sb		= sb,
		.nr_pages	= nr_pages,
		.sync_mode	= WB_SYNC_NONE,
	};
	struct backing_dev_info *bdi;

	rcu_read_lock();

	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
		if (!bdi_has_dirty_io(bdi))
			continue;

		bdi_alloc_queue_work(bdi, &args);
	}

	rcu_read_unlock();
}

/*
 * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages)
{
	if (nr_pages == 0)
		nr_pages = global_page_state(NR_FILE_DIRTY) +
				global_page_state(NR_UNSTABLE_NFS);
	bdi_writeback_all(NULL, nr_pages);
}

static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
		struct dentry *dentry;
		const char *name = "?";

		dentry = d_find_alias(inode);
		if (dentry) {
			spin_lock(&dentry->d_lock);
			name = (const char *) dentry->d_name.name;
		}
		printk(KERN_DEBUG
		       "%s(%d): dirtied inode %lu (%s) on %s\n",
		       current->comm, task_pid_nr(current), inode->i_ino,
		       name, inode->i_sb->s_id);
		if (dentry) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}
	}
}
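
/*
 * block_dump___mark_inode_dirty() is only called when the block_dump
 * sysctl (/proc/sys/vm/block_dump) is enabled.  The resulting log line
 * looks roughly like (illustrative values, formatted by the printk above):
 *
 *   cp(1234): dirtied inode 5678 (foo.txt) on sda1
 */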
/**
 * __mark_inode_dirty - internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (e.g. I_DIRTY_SYNC)
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * This function *must* be atomic for the I_DIRTY_PAGES case -
 * set_page_dirty() is called under spinlock in several places.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
	 * dirty the inode itself
	 */
	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode);
	}

	/*
	 * make sure that changes are seen by all cpus before we test i_state
	 * -- mikulas
	 */
	smp_mb();

	/* avoid the locking if we can */
	if ((inode->i_state & flags) == flags)
		return;

	if (unlikely(block_dump))
		block_dump___mark_inode_dirty(inode);

	spin_lock(&inode_lock);
	if ((inode->i_state & flags) != flags) {
		const int was_dirty = inode->i_state & I_DIRTY;

		inode->i_state |= flags;

		/*
		 * If the inode is being synced, just update its dirty state.
		 * The unlocker will place the inode on the appropriate
		 * superblock list, based upon its state.
		 */
		if (inode->i_state & I_SYNC)
			goto out;

		/*
		 * Only add valid (hashed) inodes to the superblock's
		 * dirty list.  Add blockdev inodes as well.
		 */
		if (!S_ISBLK(inode->i_mode)) {
			if (hlist_unhashed(&inode->i_hash))
				goto out;
		}
		if (inode->i_state & (I_FREEING|I_CLEAR))
			goto out;

		/*
		 * If the inode was already on b_dirty/b_io/b_more_io, don't
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
			struct backing_dev_info *bdi = wb->bdi;

			if (bdi_cap_writeback_dirty(bdi) &&
			    !test_bit(BDI_registered, &bdi->state)) {
				WARN_ON(1);
				printk(KERN_ERR "bdi-%s not registered\n",
				       bdi->name);
			}

			inode->dirtied_when = jiffies;
			list_move(&inode->i_list, &wb->b_dirty);
		}
	}
out:
	spin_unlock(&inode_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);
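
/*
 * To summarize __mark_inode_dirty() above: the inode is queued on b_dirty
 * only if it was not already dirty, is hashed (or is a blockdev inode),
 * is not being freed, and is not currently under writeback (I_SYNC); in
 * the I_SYNC case the writeback completion path requeues it instead.
 */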
/*
 * Write out a superblock's list of dirty inodes.  A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
 * If older_than_this is non-NULL, then only write out inodes which
 * had their first dirtying at a time earlier than *older_than_this.
 *
 * If we're a pdflush thread, then implement pdflush collision avoidance
 * against the entire list.
 *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched.  For other superblocks,
 * assume that all inodes are backed by the same queue.
 *
 * The inodes to be written are parked on bdi->b_io.  They are moved back onto
 * bdi->b_dirty as they are selected for writing.  This way, none can be missed
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
static void wait_sb_inodes(struct super_block *sb)
{
	struct inode *inode, *old_inode = NULL;

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	spin_lock(&inode_lock);

	/*
	 * Data integrity sync. Must wait for all pages under writeback,
	 * because there may have been pages dirtied before our sync
	 * call, but which had writeout started before we write it out.
	 * In which case, the inode may not be on the dirty list, but
	 * we still have to wait for that writeout.
	 */
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		struct address_space *mapping;

		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
			continue;
		mapping = inode->i_mapping;
		if (mapping->nrpages == 0)
			continue;
		__iget(inode);
		spin_unlock(&inode_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have
		 * been removed from s_inodes list while we dropped the
		 * inode_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it
		 * under inode_lock.  So we keep the reference and iput
		 * it later.
		 */
		iput(old_inode);
		old_inode = inode;

		filemap_fdatawait(mapping);

		cond_resched();

		spin_lock(&inode_lock);
	}
	spin_unlock(&inode_lock);
	iput(old_inode);
}

/**
 * writeback_inodes_sb	-	writeback dirty inodes from given super_block
 * @sb: the superblock
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb(struct super_block *sb)
{
	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
	long nr_to_write;

	nr_to_write = nr_dirty + nr_unstable +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);

	bdi_writeback_all(sb, nr_to_write);
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
	bdi_sync_writeback(sb->s_bdi, sb);
	wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now	-	write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
	int ret;
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};

	if (!mapping_cap_writeback_dirty(inode->i_mapping))
		wbc.nr_to_write = 0;

	might_sleep();
	spin_lock(&inode_lock);
	ret = writeback_single_inode(inode, &wbc);
	spin_unlock(&inode_lock);
	if (sync)
		inode_sync_wait(inode);
	return ret;
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk.  It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
	int ret;

	spin_lock(&inode_lock);
	ret = writeback_single_inode(inode, wbc);
	spin_unlock(&inode_lock);
	return ret;
}
EXPORT_SYMBOL(sync_inode);
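
/*
 * Typical use of the entry points above (illustrative, not exhaustive):
 * the sync(2) path (e.g. fs/sync.c) uses writeback_inodes_sb() for the
 * opportunistic, non-waiting pass and sync_inodes_sb() for the
 * data-integrity pass, while write_inode_now() and sync_inode() are used
 * by callers such as knfsd and filesystems that need to push a single
 * inode out immediately.
 */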