fs-writeback.c before (36b2e922b5acd291051fab25bc7535274ce49532) | fs-writeback.c after (d46db3d58233be4be980eb1e42eebe7808bcabab) |
---|---|
1/* 2 * fs/fs-writeback.c 3 * 4 * Copyright (C) 2002, Linus Torvalds. 5 * 6 * Contains all the functions related to writing back and waiting 7 * upon dirty inodes against superblocks, and writing back dirty 8 * pages against inodes. ie: data writeback. Writeout of the --- 16 unchanged lines hidden (view full) --- 25#include <linux/writeback.h> 26#include <linux/blkdev.h> 27#include <linux/backing-dev.h> 28#include <linux/buffer_head.h> 29#include <linux/tracepoint.h> 30#include "internal.h" 31 32/* | 1/* 2 * fs/fs-writeback.c 3 * 4 * Copyright (C) 2002, Linus Torvalds. 5 * 6 * Contains all the functions related to writing back and waiting 7 * upon dirty inodes against superblocks, and writing back dirty 8 * pages against inodes. ie: data writeback. Writeout of the --- 16 unchanged lines hidden (view full) --- 25#include <linux/writeback.h> 26#include <linux/blkdev.h> 27#include <linux/backing-dev.h> 28#include <linux/buffer_head.h> 29#include <linux/tracepoint.h> 30#include "internal.h" 31 32/* |
33 * The maximum number of pages to writeout in a single bdi flush/kupdate 34 * operation. We do this so we don't hold I_SYNC against an inode for 35 * enormous amounts of time, which would block a userspace task which has 36 * been forced to throttle against that inode. Also, the code reevaluates 37 * the dirty each time it has written this many pages. 38 */ 39#define MAX_WRITEBACK_PAGES 1024L 40 41/* |
|
33 * Passed into wb_writeback(), essentially a subset of writeback_control 34 */ 35struct wb_writeback_work { 36 long nr_pages; 37 struct super_block *sb; | 42 * Passed into wb_writeback(), essentially a subset of writeback_control 43 */ 44struct wb_writeback_work { 45 long nr_pages; 46 struct super_block *sb; |
47 unsigned long *older_than_this; |
|
38 enum writeback_sync_modes sync_mode; | 48 enum writeback_sync_modes sync_mode; |
49 unsigned int tagged_writepages:1; |
|
39 unsigned int for_kupdate:1; 40 unsigned int range_cyclic:1; 41 unsigned int for_background:1; 42 43 struct list_head list; /* pending work list */ 44 struct completion *done; /* set if the caller waits */ 45}; 46 --- 128 unchanged lines hidden (view full) --- 175 spin_unlock_bh(&bdi->wb_lock); 176} 177 178/* 179 * Remove the inode from the writeback list it is on. 180 */ 181void inode_wb_list_del(struct inode *inode) 182{ | 50 unsigned int for_kupdate:1; 51 unsigned int range_cyclic:1; 52 unsigned int for_background:1; 53 54 struct list_head list; /* pending work list */ 55 struct completion *done; /* set if the caller waits */ 56}; 57 --- 128 unchanged lines hidden (view full) --- 186 spin_unlock_bh(&bdi->wb_lock); 187} 188 189/* 190 * Remove the inode from the writeback list it is on. 191 */ 192void inode_wb_list_del(struct inode *inode) 193{ |
183 spin_lock(&inode_wb_list_lock); | 194 struct backing_dev_info *bdi = inode_to_bdi(inode); 195 196 spin_lock(&bdi->wb.list_lock); |
184 list_del_init(&inode->i_wb_list); | 197 list_del_init(&inode->i_wb_list); |
185 spin_unlock(&inode_wb_list_lock); | 198 spin_unlock(&bdi->wb.list_lock); |
186} 187 | 199} 200 |
188 | |
189/* 190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 191 * furthest end of its superblock's dirty-inode list. 192 * 193 * Before stamping the inode's ->dirtied_when, we check to see whether it is 194 * already the most-recently-dirtied inode on the b_dirty list. If that is 195 * the case then the inode must have been redirtied while it was being written 196 * out and we don't reset its dirtied_when. 197 */ | 201/* 202 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 203 * furthest end of its superblock's dirty-inode list. 204 * 205 * Before stamping the inode's ->dirtied_when, we check to see whether it is 206 * already the most-recently-dirtied inode on the b_dirty list. If that is 207 * the case then the inode must have been redirtied while it was being written 208 * out and we don't reset its dirtied_when. 209 */ |
198static void redirty_tail(struct inode *inode) | 210static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) |
199{ | 211{ |
200 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 201 202 assert_spin_locked(&inode_wb_list_lock); | 212 assert_spin_locked(&wb->list_lock); |
203 if (!list_empty(&wb->b_dirty)) { 204 struct inode *tail; 205 206 tail = wb_inode(wb->b_dirty.next); 207 if (time_before(inode->dirtied_when, tail->dirtied_when)) 208 inode->dirtied_when = jiffies; 209 } 210 list_move(&inode->i_wb_list, &wb->b_dirty); 211} 212 213/* 214 * requeue inode for re-scanning after bdi->b_io list is exhausted. 215 */ | 213 if (!list_empty(&wb->b_dirty)) { 214 struct inode *tail; 215 216 tail = wb_inode(wb->b_dirty.next); 217 if (time_before(inode->dirtied_when, tail->dirtied_when)) 218 inode->dirtied_when = jiffies; 219 } 220 list_move(&inode->i_wb_list, &wb->b_dirty); 221} 222 223/* 224 * requeue inode for re-scanning after bdi->b_io list is exhausted. 225 */ |
216static void requeue_io(struct inode *inode) | 226static void requeue_io(struct inode *inode, struct bdi_writeback *wb) |
217{ | 227{ |
218 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 219 220 assert_spin_locked(&inode_wb_list_lock); | 228 assert_spin_locked(&wb->list_lock); |
221 list_move(&inode->i_wb_list, &wb->b_more_io); 222} 223 224static void inode_sync_complete(struct inode *inode) 225{ 226 /* 227 * Prevent speculative execution through | 229 list_move(&inode->i_wb_list, &wb->b_more_io); 230} 231 232static void inode_sync_complete(struct inode *inode) 233{ 234 /* 235 * Prevent speculative execution through |
228 * spin_unlock(&inode_wb_list_lock); | 236 * spin_unlock(&wb->list_lock); |
229 */ 230 231 smp_mb(); 232 wake_up_bit(&inode->i_state, __I_SYNC); 233} 234 235static bool inode_dirtied_after(struct inode *inode, unsigned long t) 236{ --- 8 unchanged lines hidden (view full) --- 245 ret = ret && time_before_eq(inode->dirtied_when, jiffies); 246#endif 247 return ret; 248} 249 250/* 251 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 252 */ | 237 */ 238 239 smp_mb(); 240 wake_up_bit(&inode->i_state, __I_SYNC); 241} 242 243static bool inode_dirtied_after(struct inode *inode, unsigned long t) 244{ --- 8 unchanged lines hidden (view full) --- 253 ret = ret && time_before_eq(inode->dirtied_when, jiffies); 254#endif 255 return ret; 256} 257 258/* 259 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 260 */ |
253static void move_expired_inodes(struct list_head *delaying_queue, | 261static int move_expired_inodes(struct list_head *delaying_queue, |
254 struct list_head *dispatch_queue, | 262 struct list_head *dispatch_queue, |
255 unsigned long *older_than_this) | 263 unsigned long *older_than_this) |
256{ 257 LIST_HEAD(tmp); 258 struct list_head *pos, *node; 259 struct super_block *sb = NULL; 260 struct inode *inode; 261 int do_sb_sort = 0; | 264{ 265 LIST_HEAD(tmp); 266 struct list_head *pos, *node; 267 struct super_block *sb = NULL; 268 struct inode *inode; 269 int do_sb_sort = 0; |
270 int moved = 0; |
|
262 263 while (!list_empty(delaying_queue)) { 264 inode = wb_inode(delaying_queue->prev); 265 if (older_than_this && 266 inode_dirtied_after(inode, *older_than_this)) 267 break; 268 if (sb && sb != inode->i_sb) 269 do_sb_sort = 1; 270 sb = inode->i_sb; 271 list_move(&inode->i_wb_list, &tmp); | 271 272 while (!list_empty(delaying_queue)) { 273 inode = wb_inode(delaying_queue->prev); 274 if (older_than_this && 275 inode_dirtied_after(inode, *older_than_this)) 276 break; 277 if (sb && sb != inode->i_sb) 278 do_sb_sort = 1; 279 sb = inode->i_sb; 280 list_move(&inode->i_wb_list, &tmp); |
281 moved++; |
|
272 } 273 274 /* just one sb in list, splice to dispatch_queue and we're done */ 275 if (!do_sb_sort) { 276 list_splice(&tmp, dispatch_queue); | 282 } 283 284 /* just one sb in list, splice to dispatch_queue and we're done */ 285 if (!do_sb_sort) { 286 list_splice(&tmp, dispatch_queue); |
277 return; | 287 goto out; |
278 } 279 280 /* Move inodes from one superblock together */ 281 while (!list_empty(&tmp)) { 282 sb = wb_inode(tmp.prev)->i_sb; 283 list_for_each_prev_safe(pos, node, &tmp) { 284 inode = wb_inode(pos); 285 if (inode->i_sb == sb) 286 list_move(&inode->i_wb_list, dispatch_queue); 287 } 288 } | 288 } 289 290 /* Move inodes from one superblock together */ 291 while (!list_empty(&tmp)) { 292 sb = wb_inode(tmp.prev)->i_sb; 293 list_for_each_prev_safe(pos, node, &tmp) { 294 inode = wb_inode(pos); 295 if (inode->i_sb == sb) 296 list_move(&inode->i_wb_list, dispatch_queue); 297 } 298 } |
299out: 300 return moved; |
|
289} 290 291/* 292 * Queue all expired dirty inodes for io, eldest first. 293 * Before 294 * newly dirtied b_dirty b_io b_more_io 295 * =============> gf edc BA 296 * After 297 * newly dirtied b_dirty b_io b_more_io 298 * =============> g fBAedc 299 * | 300 * +--> dequeue for IO 301 */ 302static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 303{ | 301} 302 303/* 304 * Queue all expired dirty inodes for io, eldest first. 305 * Before 306 * newly dirtied b_dirty b_io b_more_io 307 * =============> gf edc BA 308 * After 309 * newly dirtied b_dirty b_io b_more_io 310 * =============> g fBAedc 311 * | 312 * +--> dequeue for IO 313 */ 314static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 315{ |
304 assert_spin_locked(&inode_wb_list_lock); | 316 int moved; 317 assert_spin_locked(&wb->list_lock); |
305 list_splice_init(&wb->b_more_io, &wb->b_io); | 318 list_splice_init(&wb->b_more_io, &wb->b_io); |
306 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); | 319 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 320 trace_writeback_queue_io(wb, older_than_this, moved); |
307} 308 309static int write_inode(struct inode *inode, struct writeback_control *wbc) 310{ 311 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 312 return inode->i_sb->s_op->write_inode(inode, wbc); 313 return 0; 314} 315 316/* 317 * Wait for writeback on an inode to complete. 318 */ | 321} 322 323static int write_inode(struct inode *inode, struct writeback_control *wbc) 324{ 325 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 326 return inode->i_sb->s_op->write_inode(inode, wbc); 327 return 0; 328} 329 330/* 331 * Wait for writeback on an inode to complete. 332 */ |
319static void inode_wait_for_writeback(struct inode *inode) | 333static void inode_wait_for_writeback(struct inode *inode, 334 struct bdi_writeback *wb) |
320{ 321 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 322 wait_queue_head_t *wqh; 323 324 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 325 while (inode->i_state & I_SYNC) { 326 spin_unlock(&inode->i_lock); | 335{ 336 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 337 wait_queue_head_t *wqh; 338 339 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 340 while (inode->i_state & I_SYNC) { 341 spin_unlock(&inode->i_lock); |
327 spin_unlock(&inode_wb_list_lock); | 342 spin_unlock(&wb->list_lock); |
328 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); | 343 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); |
329 spin_lock(&inode_wb_list_lock); | 344 spin_lock(&wb->list_lock); |
330 spin_lock(&inode->i_lock); 331 } 332} 333 334/* | 345 spin_lock(&inode->i_lock); 346 } 347} 348 349/* |
335 * Write out an inode's dirty pages. Called under inode_wb_list_lock and | 350 * Write out an inode's dirty pages. Called under wb->list_lock and |
336 * inode->i_lock. Either the caller has an active reference on the inode or 337 * the inode has I_WILL_FREE set. 338 * 339 * If `wait' is set, wait on the writeout. 340 * 341 * The whole writeout design is quite complex and fragile. We want to avoid 342 * starvation of particular inodes when others are being redirtied, prevent 343 * livelocks, etc. 344 */ 345static int | 351 * inode->i_lock. Either the caller has an active reference on the inode or 352 * the inode has I_WILL_FREE set. 353 * 354 * If `wait' is set, wait on the writeout. 355 * 356 * The whole writeout design is quite complex and fragile. We want to avoid 357 * starvation of particular inodes when others are being redirtied, prevent 358 * livelocks, etc. 359 */ 360static int |
346writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | 361writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, 362 struct writeback_control *wbc) |
347{ 348 struct address_space *mapping = inode->i_mapping; | 363{ 364 struct address_space *mapping = inode->i_mapping; |
365 long nr_to_write = wbc->nr_to_write; |
|
349 unsigned dirty; 350 int ret; 351 | 366 unsigned dirty; 367 int ret; 368 |
352 assert_spin_locked(&inode_wb_list_lock); | 369 assert_spin_locked(&wb->list_lock); |
353 assert_spin_locked(&inode->i_lock); 354 355 if (!atomic_read(&inode->i_count)) 356 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 357 else 358 WARN_ON(inode->i_state & I_WILL_FREE); 359 360 if (inode->i_state & I_SYNC) { 361 /* 362 * If this inode is locked for writeback and we are not doing 363 * writeback-for-data-integrity, move it to b_more_io so that 364 * writeback can proceed with the other inodes on s_io. 365 * 366 * We'll have another go at writing back this inode when we 367 * completed a full scan of b_io. 368 */ 369 if (wbc->sync_mode != WB_SYNC_ALL) { | 370 assert_spin_locked(&inode->i_lock); 371 372 if (!atomic_read(&inode->i_count)) 373 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 374 else 375 WARN_ON(inode->i_state & I_WILL_FREE); 376 377 if (inode->i_state & I_SYNC) { 378 /* 379 * If this inode is locked for writeback and we are not doing 380 * writeback-for-data-integrity, move it to b_more_io so that 381 * writeback can proceed with the other inodes on s_io. 382 * 383 * We'll have another go at writing back this inode when we 384 * completed a full scan of b_io. 385 */ 386 if (wbc->sync_mode != WB_SYNC_ALL) { |
370 requeue_io(inode); | 387 requeue_io(inode, wb); 388 trace_writeback_single_inode_requeue(inode, wbc, 389 nr_to_write); |
371 return 0; 372 } 373 374 /* 375 * It's a data-integrity sync. We must wait. 376 */ | 390 return 0; 391 } 392 393 /* 394 * It's a data-integrity sync. We must wait. 395 */ |
377 inode_wait_for_writeback(inode); | 396 inode_wait_for_writeback(inode, wb); |
378 } 379 380 BUG_ON(inode->i_state & I_SYNC); 381 382 /* Set I_SYNC, reset I_DIRTY_PAGES */ 383 inode->i_state |= I_SYNC; 384 inode->i_state &= ~I_DIRTY_PAGES; 385 spin_unlock(&inode->i_lock); | 397 } 398 399 BUG_ON(inode->i_state & I_SYNC); 400 401 /* Set I_SYNC, reset I_DIRTY_PAGES */ 402 inode->i_state |= I_SYNC; 403 inode->i_state &= ~I_DIRTY_PAGES; 404 spin_unlock(&inode->i_lock); |
386 spin_unlock(&inode_wb_list_lock); | 405 spin_unlock(&wb->list_lock); |
387 388 ret = do_writepages(mapping, wbc); 389 390 /* 391 * Make sure to wait on the data before writing out the metadata. 392 * This is important for filesystems that modify metadata on data 393 * I/O completion. 394 */ --- 14 unchanged lines hidden (view full) --- 409 spin_unlock(&inode->i_lock); 410 /* Don't write the inode if only I_DIRTY_PAGES was set */ 411 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 412 int err = write_inode(inode, wbc); 413 if (ret == 0) 414 ret = err; 415 } 416 | 406 407 ret = do_writepages(mapping, wbc); 408 409 /* 410 * Make sure to wait on the data before writing out the metadata. 411 * This is important for filesystems that modify metadata on data 412 * I/O completion. 413 */ --- 14 unchanged lines hidden (view full) --- 428 spin_unlock(&inode->i_lock); 429 /* Don't write the inode if only I_DIRTY_PAGES was set */ 430 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 431 int err = write_inode(inode, wbc); 432 if (ret == 0) 433 ret = err; 434 } 435 |
417 spin_lock(&inode_wb_list_lock); | 436 spin_lock(&wb->list_lock); |
418 spin_lock(&inode->i_lock); 419 inode->i_state &= ~I_SYNC; 420 if (!(inode->i_state & I_FREEING)) { | 437 spin_lock(&inode->i_lock); 438 inode->i_state &= ~I_SYNC; 439 if (!(inode->i_state & I_FREEING)) { |
440 /* 441 * Sync livelock prevention. Each inode is tagged and synced in 442 * one shot. If still dirty, it will be redirty_tail()'ed below. 443 * Update the dirty time to prevent enqueue and sync it again. 444 */ 445 if ((inode->i_state & I_DIRTY) && 446 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) 447 inode->dirtied_when = jiffies; 448 |
|
421 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 422 /* 423 * We didn't write back all the pages. nfs_writepages() 424 * sometimes bales out without doing anything. 425 */ 426 inode->i_state |= I_DIRTY_PAGES; 427 if (wbc->nr_to_write <= 0) { 428 /* 429 * slice used up: queue for next turn 430 */ | 449 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 450 /* 451 * We didn't write back all the pages. nfs_writepages() 452 * sometimes bales out without doing anything. 453 */ 454 inode->i_state |= I_DIRTY_PAGES; 455 if (wbc->nr_to_write <= 0) { 456 /* 457 * slice used up: queue for next turn 458 */ |
431 requeue_io(inode); | 459 requeue_io(inode, wb); |
432 } else { 433 /* 434 * Writeback blocked by something other than 435 * congestion. Delay the inode for some time to 436 * avoid spinning on the CPU (100% iowait) 437 * retrying writeback of the dirty page/inode 438 * that cannot be performed immediately. 439 */ | 460 } else { 461 /* 462 * Writeback blocked by something other than 463 * congestion. Delay the inode for some time to 464 * avoid spinning on the CPU (100% iowait) 465 * retrying writeback of the dirty page/inode 466 * that cannot be performed immediately. 467 */ |
440 redirty_tail(inode); | 468 redirty_tail(inode, wb); |
441 } 442 } else if (inode->i_state & I_DIRTY) { 443 /* 444 * Filesystems can dirty the inode during writeback 445 * operations, such as delayed allocation during 446 * submission or metadata updates after data IO 447 * completion. 448 */ | 469 } 470 } else if (inode->i_state & I_DIRTY) { 471 /* 472 * Filesystems can dirty the inode during writeback 473 * operations, such as delayed allocation during 474 * submission or metadata updates after data IO 475 * completion. 476 */ |
449 redirty_tail(inode); | 477 redirty_tail(inode, wb); |
450 } else { 451 /* 452 * The inode is clean. At this point we either have 453 * a reference to the inode or it's on its way out. 454 * No need to add it back to the LRU. 455 */ 456 list_del_init(&inode->i_wb_list); 457 } 458 } 459 inode_sync_complete(inode); | 478 } else { 479 /* 480 * The inode is clean. At this point we either have 481 * a reference to the inode or it's on its way out. 482 * No need to add it back to the LRU. 483 */ 484 list_del_init(&inode->i_wb_list); 485 } 486 } 487 inode_sync_complete(inode); |
488 trace_writeback_single_inode(inode, wbc, nr_to_write); |
|
460 return ret; 461} 462 463/* 464 * For background writeback the caller does not have the sb pinned 465 * before calling writeback. So make sure that we do pin it, so it doesn't 466 * go away while we are writing inodes from it. 467 */ --- 13 unchanged lines hidden (view full) --- 481 return true; 482 up_read(&sb->s_umount); 483 } 484 485 put_super(sb); 486 return false; 487} 488 | 489 return ret; 490} 491 492/* 493 * For background writeback the caller does not have the sb pinned 494 * before calling writeback. So make sure that we do pin it, so it doesn't 495 * go away while we are writing inodes from it. 496 */ --- 13 unchanged lines hidden (view full) --- 510 return true; 511 up_read(&sb->s_umount); 512 } 513 514 put_super(sb); 515 return false; 516} 517 |
518static long writeback_chunk_size(struct wb_writeback_work *work) 519{ 520 long pages; 521 522 /* 523 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty 524 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX 525 * here avoids calling into writeback_inodes_wb() more than once. 526 * 527 * The intended call sequence for WB_SYNC_ALL writeback is: 528 * 529 * wb_writeback() 530 * writeback_sb_inodes() <== called only once 531 * write_cache_pages() <== called once for each inode 532 * (quickly) tag currently dirty pages 533 * (maybe slowly) sync all tagged pages 534 */ 535 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) 536 pages = LONG_MAX; 537 else 538 pages = min(MAX_WRITEBACK_PAGES, work->nr_pages); 539 540 return pages; 541} 542 |
|
489/* 490 * Write a portion of b_io inodes which belong to @sb. 491 * 492 * If @only_this_sb is true, then find and write all such 493 * inodes. Otherwise write only ones which go sequentially 494 * in reverse order. 495 * | 543/* 544 * Write a portion of b_io inodes which belong to @sb. 545 * 546 * If @only_this_sb is true, then find and write all such 547 * inodes. Otherwise write only ones which go sequentially 548 * in reverse order. 549 * |
496 * Return 1, if the caller writeback routine should be 497 * interrupted. Otherwise return 0. | 550 * Return the number of pages and/or inodes written. |
498 */ | 551 */ |
499static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, 500 struct writeback_control *wbc, bool only_this_sb) | 552static long writeback_sb_inodes(struct super_block *sb, 553 struct bdi_writeback *wb, 554 struct wb_writeback_work *work) |
501{ | 555{ |
556 struct writeback_control wbc = { 557 .sync_mode = work->sync_mode, 558 .tagged_writepages = work->tagged_writepages, 559 .for_kupdate = work->for_kupdate, 560 .for_background = work->for_background, 561 .range_cyclic = work->range_cyclic, 562 .range_start = 0, 563 .range_end = LLONG_MAX, 564 }; 565 unsigned long start_time = jiffies; 566 long write_chunk; 567 long wrote = 0; /* count both pages and inodes */ 568 |
|
502 while (!list_empty(&wb->b_io)) { | 569 while (!list_empty(&wb->b_io)) { |
503 long pages_skipped; | |
504 struct inode *inode = wb_inode(wb->b_io.prev); 505 506 if (inode->i_sb != sb) { | 570 struct inode *inode = wb_inode(wb->b_io.prev); 571 572 if (inode->i_sb != sb) { |
507 if (only_this_sb) { | 573 if (work->sb) { |
508 /* 509 * We only want to write back data for this 510 * superblock, move all inodes not belonging 511 * to it back onto the dirty list. 512 */ | 574 /* 575 * We only want to write back data for this 576 * superblock, move all inodes not belonging 577 * to it back onto the dirty list. 578 */ |
513 redirty_tail(inode); | 579 redirty_tail(inode, wb); |
514 continue; 515 } 516 517 /* 518 * The inode belongs to a different superblock. 519 * Bounce back to the caller to unpin this and 520 * pin the next superblock. 521 */ | 580 continue; 581 } 582 583 /* 584 * The inode belongs to a different superblock. 585 * Bounce back to the caller to unpin this and 586 * pin the next superblock. 587 */ |
522 return 0; | 588 break; |
523 } 524 525 /* 526 * Don't bother with new inodes or inodes being freed, first 527 * kind does not need periodic writeout yet, and for the latter 528 * kind writeout is handled by the freer. 529 */ 530 spin_lock(&inode->i_lock); 531 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 532 spin_unlock(&inode->i_lock); | 589 } 590 591 /* 592 * Don't bother with new inodes or inodes being freed, first 593 * kind does not need periodic writeout yet, and for the latter 594 * kind writeout is handled by the freer. 595 */ 596 spin_lock(&inode->i_lock); 597 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 598 spin_unlock(&inode->i_lock); |
533 requeue_io(inode); | 599 requeue_io(inode, wb); |
534 continue; 535 } | 600 continue; 601 } |
536 537 /* 538 * Was this inode dirtied after sync_sb_inodes was called? 539 * This keeps sync from extra jobs and livelock. 540 */ 541 if (inode_dirtied_after(inode, wbc->wb_start)) { 542 spin_unlock(&inode->i_lock); 543 return 1; 544 } 545 | |
546 __iget(inode); | 602 __iget(inode); |
603 write_chunk = writeback_chunk_size(work); 604 wbc.nr_to_write = write_chunk; 605 wbc.pages_skipped = 0; |
|
547 | 606 |
548 pages_skipped = wbc->pages_skipped; 549 writeback_single_inode(inode, wbc); 550 if (wbc->pages_skipped != pages_skipped) { | 607 writeback_single_inode(inode, wb, &wbc); 608 609 work->nr_pages -= write_chunk - wbc.nr_to_write; 610 wrote += write_chunk - wbc.nr_to_write; 611 if (!(inode->i_state & I_DIRTY)) 612 wrote++; 613 if (wbc.pages_skipped) { |
551 /* 552 * writeback is not making progress due to locked 553 * buffers. Skip this inode for now. 554 */ | 614 /* 615 * writeback is not making progress due to locked 616 * buffers. Skip this inode for now. 617 */ |
555 redirty_tail(inode); | 618 redirty_tail(inode, wb); |
556 } 557 spin_unlock(&inode->i_lock); | 619 } 620 spin_unlock(&inode->i_lock); |
558 spin_unlock(&inode_wb_list_lock); | 621 spin_unlock(&wb->list_lock); |
559 iput(inode); 560 cond_resched(); | 622 iput(inode); 623 cond_resched(); |
561 spin_lock(&inode_wb_list_lock); 562 if (wbc->nr_to_write <= 0) { 563 wbc->more_io = 1; 564 return 1; | 624 spin_lock(&wb->list_lock); 625 /* 626 * bail out to wb_writeback() often enough to check 627 * background threshold and other termination conditions. 628 */ 629 if (wrote) { 630 if (time_is_before_jiffies(start_time + HZ / 10UL)) 631 break; 632 if (work->nr_pages <= 0) 633 break; |
565 } | 634 } |
566 if (!list_empty(&wb->b_more_io)) 567 wbc->more_io = 1; | |
568 } | 635 } |
569 /* b_io is empty */ 570 return 1; | 636 return wrote; |
571} 572 | 637} 638 |
573void writeback_inodes_wb(struct bdi_writeback *wb, 574 struct writeback_control *wbc) | 639static long __writeback_inodes_wb(struct bdi_writeback *wb, 640 struct wb_writeback_work *work) |
575{ | 641{ |
576 int ret = 0; | 642 unsigned long start_time = jiffies; 643 long wrote = 0; |
577 | 644 |
578 if (!wbc->wb_start) 579 wbc->wb_start = jiffies; /* livelock avoidance */ 580 spin_lock(&inode_wb_list_lock); 581 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 582 queue_io(wb, wbc->older_than_this); 583 | |
584 while (!list_empty(&wb->b_io)) { 585 struct inode *inode = wb_inode(wb->b_io.prev); 586 struct super_block *sb = inode->i_sb; 587 588 if (!pin_sb_for_writeback(sb)) { | 645 while (!list_empty(&wb->b_io)) { 646 struct inode *inode = wb_inode(wb->b_io.prev); 647 struct super_block *sb = inode->i_sb; 648 649 if (!pin_sb_for_writeback(sb)) { |
589 requeue_io(inode); | 650 requeue_io(inode, wb); |
590 continue; 591 } | 651 continue; 652 } |
592 ret = writeback_sb_inodes(sb, wb, wbc, false); | 653 wrote += writeback_sb_inodes(sb, wb, work); |
593 drop_super(sb); 594 | 654 drop_super(sb); 655 |
595 if (ret) 596 break; | 656 /* refer to the same tests at the end of writeback_sb_inodes */ 657 if (wrote) { 658 if (time_is_before_jiffies(start_time + HZ / 10UL)) 659 break; 660 if (work->nr_pages <= 0) 661 break; 662 } |
597 } | 663 } |
598 spin_unlock(&inode_wb_list_lock); | |
599 /* Leave any unwritten inodes on b_io */ | 664 /* Leave any unwritten inodes on b_io */ |
665 return wrote; |
|
600} 601 | 666} 667 |
602static void __writeback_inodes_sb(struct super_block *sb, 603 struct bdi_writeback *wb, struct writeback_control *wbc) | 668long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) |
604{ | 669{ |
605 WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 670 struct wb_writeback_work work = { 671 .nr_pages = nr_pages, 672 .sync_mode = WB_SYNC_NONE, 673 .range_cyclic = 1, 674 }; |
606 | 675 |
607 spin_lock(&inode_wb_list_lock); 608 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 609 queue_io(wb, wbc->older_than_this); 610 writeback_sb_inodes(sb, wb, wbc, true); 611 spin_unlock(&inode_wb_list_lock); | 676 spin_lock(&wb->list_lock); 677 if (list_empty(&wb->b_io)) 678 queue_io(wb, NULL); 679 __writeback_inodes_wb(wb, &work); 680 spin_unlock(&wb->list_lock); 681 682 return nr_pages - work.nr_pages; |
612} 613 | 683} 684 |
614/* 615 * The maximum number of pages to writeout in a single bdi flush/kupdate 616 * operation. We do this so we don't hold I_SYNC against an inode for 617 * enormous amounts of time, which would block a userspace task which has 618 * been forced to throttle against that inode. Also, the code reevaluates 619 * the dirty each time it has written this many pages. 620 */ 621#define MAX_WRITEBACK_PAGES 1024 622 | |
623static inline bool over_bground_thresh(void) 624{ 625 unsigned long background_thresh, dirty_thresh; 626 627 global_dirty_limits(&background_thresh, &dirty_thresh); 628 629 return (global_page_state(NR_FILE_DIRTY) + 630 global_page_state(NR_UNSTABLE_NFS) > background_thresh); --- 12 unchanged lines hidden (view full) --- 643 * one-second gap. 644 * 645 * older_than_this takes precedence over nr_to_write. So we'll only write back 646 * all dirty pages if they are all attached to "old" mappings. 647 */ 648static long wb_writeback(struct bdi_writeback *wb, 649 struct wb_writeback_work *work) 650{ | 685static inline bool over_bground_thresh(void) 686{ 687 unsigned long background_thresh, dirty_thresh; 688 689 global_dirty_limits(&background_thresh, &dirty_thresh); 690 691 return (global_page_state(NR_FILE_DIRTY) + 692 global_page_state(NR_UNSTABLE_NFS) > background_thresh); --- 12 unchanged lines hidden (view full) --- 705 * one-second gap. 706 * 707 * older_than_this takes precedence over nr_to_write. So we'll only write back 708 * all dirty pages if they are all attached to "old" mappings. 709 */ 710static long wb_writeback(struct bdi_writeback *wb, 711 struct wb_writeback_work *work) 712{ |
651 struct writeback_control wbc = { 652 .sync_mode = work->sync_mode, 653 .older_than_this = NULL, 654 .for_kupdate = work->for_kupdate, 655 .for_background = work->for_background, 656 .range_cyclic = work->range_cyclic, 657 }; | 713 long nr_pages = work->nr_pages; |
658 unsigned long oldest_jif; | 714 unsigned long oldest_jif; |
659 long wrote = 0; 660 long write_chunk; | |
661 struct inode *inode; | 715 struct inode *inode; |
716 long progress; |
|
662 | 717 |
663 if (wbc.for_kupdate) { 664 wbc.older_than_this = &oldest_jif; 665 oldest_jif = jiffies - 666 msecs_to_jiffies(dirty_expire_interval * 10); 667 } 668 if (!wbc.range_cyclic) { 669 wbc.range_start = 0; 670 wbc.range_end = LLONG_MAX; 671 } | 718 oldest_jif = jiffies; 719 work->older_than_this = &oldest_jif; |
672 | 720 |
673 /* 674 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty 675 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX 676 * here avoids calling into writeback_inodes_wb() more than once. 677 * 678 * The intended call sequence for WB_SYNC_ALL writeback is: 679 * 680 * wb_writeback() 681 * __writeback_inodes_sb() <== called only once 682 * write_cache_pages() <== called once for each inode 683 * (quickly) tag currently dirty pages 684 * (maybe slowly) sync all tagged pages 685 */ 686 if (wbc.sync_mode == WB_SYNC_NONE) 687 write_chunk = MAX_WRITEBACK_PAGES; 688 else 689 write_chunk = LONG_MAX; 690 691 wbc.wb_start = jiffies; /* livelock avoidance */ | 721 spin_lock(&wb->list_lock); |
692 for (;;) { 693 /* 694 * Stop writeback when nr_pages has been consumed 695 */ 696 if (work->nr_pages <= 0) 697 break; 698 699 /* --- 8 unchanged lines hidden (view full) --- 708 709 /* 710 * For background writeout, stop when we are below the 711 * background dirty threshold 712 */ 713 if (work->for_background && !over_bground_thresh()) 714 break; 715 | 722 for (;;) { 723 /* 724 * Stop writeback when nr_pages has been consumed 725 */ 726 if (work->nr_pages <= 0) 727 break; 728 729 /* --- 8 unchanged lines hidden (view full) --- 738 739 /* 740 * For background writeout, stop when we are below the 741 * background dirty threshold 742 */ 743 if (work->for_background && !over_bground_thresh()) 744 break; 745 |
716 wbc.more_io = 0; 717 wbc.nr_to_write = write_chunk; 718 wbc.pages_skipped = 0; | 746 if (work->for_kupdate) { 747 oldest_jif = jiffies - 748 msecs_to_jiffies(dirty_expire_interval * 10); 749 work->older_than_this = &oldest_jif; 750 } |
719 | 751 |
720 trace_wbc_writeback_start(&wbc, wb->bdi); | 752 trace_writeback_start(wb->bdi, work); 753 if (list_empty(&wb->b_io)) 754 queue_io(wb, work->older_than_this); |
721 if (work->sb) | 755 if (work->sb) |
722 __writeback_inodes_sb(work->sb, wb, &wbc); | 756 progress = writeback_sb_inodes(work->sb, wb, work); |
723 else | 757 else |
724 writeback_inodes_wb(wb, &wbc); 725 trace_wbc_writeback_written(&wbc, wb->bdi); | 758 progress = __writeback_inodes_wb(wb, work); 759 trace_writeback_written(wb->bdi, work); |
726 | 760 |
727 work->nr_pages -= write_chunk - wbc.nr_to_write; 728 wrote += write_chunk - wbc.nr_to_write; 729 | |
730 /* | 761 /* |
731 * If we consumed everything, see if we have more | 762 * Did we write something? Try for more 763 * 764 * Dirty inodes are moved to b_io for writeback in batches. 765 * The completion of the current batch does not necessarily 766 * mean the overall work is done. So we keep looping as long 767 * as made some progress on cleaning pages or inodes. |
732 */ | 768 */ |
733 if (wbc.nr_to_write <= 0) | 769 if (progress) |
734 continue; 735 /* | 770 continue; 771 /* |
736 * Didn't write everything and we don't have more IO, bail | 772 * No more inodes for IO, bail |
737 */ | 773 */ |
738 if (!wbc.more_io) | 774 if (list_empty(&wb->b_more_io)) |
739 break; 740 /* | 775 break; 776 /* |
741 * Did we write something? Try for more 742 */ 743 if (wbc.nr_to_write < write_chunk) 744 continue; 745 /* | |
746 * Nothing written. Wait for some inode to 747 * become available for writeback. Otherwise 748 * we'll just busyloop. 749 */ | 777 * Nothing written. Wait for some inode to 778 * become available for writeback. Otherwise 779 * we'll just busyloop. 780 */ |
750 spin_lock(&inode_wb_list_lock); | |
751 if (!list_empty(&wb->b_more_io)) { | 781 if (!list_empty(&wb->b_more_io)) { |
782 trace_writeback_wait(wb->bdi, work); |
|
752 inode = wb_inode(wb->b_more_io.prev); | 783 inode = wb_inode(wb->b_more_io.prev); |
753 trace_wbc_writeback_wait(&wbc, wb->bdi); | |
754 spin_lock(&inode->i_lock); | 784 spin_lock(&inode->i_lock); |
755 inode_wait_for_writeback(inode); | 785 inode_wait_for_writeback(inode, wb); |
756 spin_unlock(&inode->i_lock); 757 } | 786 spin_unlock(&inode->i_lock); 787 } |
758 spin_unlock(&inode_wb_list_lock); | |
759 } | 788 } |
789 spin_unlock(&wb->list_lock); |
|
760 | 790 |
761 return wrote; | 791 return nr_pages - work->nr_pages; |
762} 763 764/* 765 * Return the next wb_writeback_work struct that hasn't been processed yet. 766 */ 767static struct wb_writeback_work * 768get_next_work_item(struct backing_dev_info *bdi) 769{ --- 314 unchanged lines hidden (view full) --- 1084 * bdi thread to make sure background 1085 * write-back happens later. 1086 */ 1087 if (!wb_has_dirty_io(&bdi->wb)) 1088 wakeup_bdi = true; 1089 } 1090 1091 spin_unlock(&inode->i_lock); | 792} 793 794/* 795 * Return the next wb_writeback_work struct that hasn't been processed yet. 796 */ 797static struct wb_writeback_work * 798get_next_work_item(struct backing_dev_info *bdi) 799{ --- 314 unchanged lines hidden (view full) --- 1114 * bdi thread to make sure background 1115 * write-back happens later. 1116 */ 1117 if (!wb_has_dirty_io(&bdi->wb)) 1118 wakeup_bdi = true; 1119 } 1120 1121 spin_unlock(&inode->i_lock); |
1092 spin_lock(&inode_wb_list_lock); | 1122 spin_lock(&bdi->wb.list_lock); |
1093 inode->dirtied_when = jiffies; 1094 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); | 1123 inode->dirtied_when = jiffies; 1124 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); |
1095 spin_unlock(&inode_wb_list_lock); | 1125 spin_unlock(&bdi->wb.list_lock); |
1096 1097 if (wakeup_bdi) 1098 bdi_wakeup_thread_delayed(bdi); 1099 return; 1100 } 1101 } 1102out_unlock_inode: 1103 spin_unlock(&inode->i_lock); --- 79 unchanged lines hidden (view full) --- 1183 * Start writeback on some inodes on this super_block. No guarantees are made 1184 * on how many (if any) will be written, and this function does not wait 1185 * for IO completion of submitted IO. 1186 */ 1187void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) 1188{ 1189 DECLARE_COMPLETION_ONSTACK(done); 1190 struct wb_writeback_work work = { | 1126 1127 if (wakeup_bdi) 1128 bdi_wakeup_thread_delayed(bdi); 1129 return; 1130 } 1131 } 1132out_unlock_inode: 1133 spin_unlock(&inode->i_lock); --- 79 unchanged lines hidden (view full) --- 1213 * Start writeback on some inodes on this super_block. No guarantees are made 1214 * on how many (if any) will be written, and this function does not wait 1215 * for IO completion of submitted IO. 1216 */ 1217void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) 1218{ 1219 DECLARE_COMPLETION_ONSTACK(done); 1220 struct wb_writeback_work work = { |
1191 .sb = sb, 1192 .sync_mode = WB_SYNC_NONE, 1193 .done = &done, 1194 .nr_pages = nr, | 1221 .sb = sb, 1222 .sync_mode = WB_SYNC_NONE, 1223 .tagged_writepages = 1, 1224 .done = &done, 1225 .nr_pages = nr, |
1195 }; 1196 1197 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1198 bdi_queue_work(sb->s_bdi, &work); 1199 wait_for_completion(&done); 1200} 1201EXPORT_SYMBOL(writeback_inodes_sb_nr); 1202 --- 85 unchanged lines hidden (view full) --- 1288 * 1289 * This function commits an inode to disk immediately if it is dirty. This is 1290 * primarily needed by knfsd. 1291 * 1292 * The caller must either have a ref on the inode or must have set I_WILL_FREE. 1293 */ 1294int write_inode_now(struct inode *inode, int sync) 1295{ | 1226 }; 1227 1228 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1229 bdi_queue_work(sb->s_bdi, &work); 1230 wait_for_completion(&done); 1231} 1232EXPORT_SYMBOL(writeback_inodes_sb_nr); 1233 --- 85 unchanged lines hidden (view full) --- 1319 * 1320 * This function commits an inode to disk immediately if it is dirty. This is 1321 * primarily needed by knfsd. 1322 * 1323 * The caller must either have a ref on the inode or must have set I_WILL_FREE. 1324 */ 1325int write_inode_now(struct inode *inode, int sync) 1326{ |
1327 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; |
|
1296 int ret; 1297 struct writeback_control wbc = { 1298 .nr_to_write = LONG_MAX, 1299 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, 1300 .range_start = 0, 1301 .range_end = LLONG_MAX, 1302 }; 1303 1304 if (!mapping_cap_writeback_dirty(inode->i_mapping)) 1305 wbc.nr_to_write = 0; 1306 1307 might_sleep(); | 1328 int ret; 1329 struct writeback_control wbc = { 1330 .nr_to_write = LONG_MAX, 1331 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, 1332 .range_start = 0, 1333 .range_end = LLONG_MAX, 1334 }; 1335 1336 if (!mapping_cap_writeback_dirty(inode->i_mapping)) 1337 wbc.nr_to_write = 0; 1338 1339 might_sleep(); |
1308 spin_lock(&inode_wb_list_lock); | 1340 spin_lock(&wb->list_lock); |
1309 spin_lock(&inode->i_lock); | 1341 spin_lock(&inode->i_lock); |
1310 ret = writeback_single_inode(inode, &wbc); | 1342 ret = writeback_single_inode(inode, wb, &wbc); |
1311 spin_unlock(&inode->i_lock); | 1343 spin_unlock(&inode->i_lock); |
1312 spin_unlock(&inode_wb_list_lock); | 1344 spin_unlock(&wb->list_lock); |
1313 if (sync) 1314 inode_sync_wait(inode); 1315 return ret; 1316} 1317EXPORT_SYMBOL(write_inode_now); 1318 1319/** 1320 * sync_inode - write an inode and its pages to disk. 1321 * @inode: the inode to sync 1322 * @wbc: controls the writeback mode 1323 * 1324 * sync_inode() will write an inode and its pages to disk. It will also 1325 * correctly update the inode on its superblock's dirty inode lists and will 1326 * update inode->i_state. 1327 * 1328 * The caller must have a ref on the inode. 1329 */ 1330int sync_inode(struct inode *inode, struct writeback_control *wbc) 1331{ | 1345 if (sync) 1346 inode_sync_wait(inode); 1347 return ret; 1348} 1349EXPORT_SYMBOL(write_inode_now); 1350 1351/** 1352 * sync_inode - write an inode and its pages to disk. 1353 * @inode: the inode to sync 1354 * @wbc: controls the writeback mode 1355 * 1356 * sync_inode() will write an inode and its pages to disk. It will also 1357 * correctly update the inode on its superblock's dirty inode lists and will 1358 * update inode->i_state. 1359 * 1360 * The caller must have a ref on the inode. 1361 */ 1362int sync_inode(struct inode *inode, struct writeback_control *wbc) 1363{ |
1364 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; |
|
1332 int ret; 1333 | 1365 int ret; 1366 |
1334 spin_lock(&inode_wb_list_lock); | 1367 spin_lock(&wb->list_lock); |
1335 spin_lock(&inode->i_lock); | 1368 spin_lock(&inode->i_lock); |
1336 ret = writeback_single_inode(inode, wbc); | 1369 ret = writeback_single_inode(inode, wb, wbc); |
1337 spin_unlock(&inode->i_lock); | 1370 spin_unlock(&inode->i_lock); |
1338 spin_unlock(&inode_wb_list_lock); | 1371 spin_unlock(&wb->list_lock); |
1339 return ret; 1340} 1341EXPORT_SYMBOL(sync_inode); 1342 1343/** 1344 * sync_inode_metadata - write an inode to disk 1345 * @inode: the inode to sync 1346 * @wait: wait for I/O to complete. --- 15 unchanged lines hidden --- | 1372 return ret; 1373} 1374EXPORT_SYMBOL(sync_inode); 1375 1376/** 1377 * sync_inode_metadata - write an inode to disk 1378 * @inode: the inode to sync 1379 * @wait: wait for I/O to complete. --- 15 unchanged lines hidden --- |