/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   do_invalidatepage */
#include <linux/cleancache.h>
#include "internal.h"


/**
 * do_invalidatepage - invalidate part or all of a page
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * do_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * do_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point, because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void do_invalidatepage(struct page *page, unsigned long offset)
{
	void (*invalidatepage)(struct page *, unsigned long);
	invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
	if (!invalidatepage)
		invalidatepage = block_invalidatepage;
#endif
	if (invalidatepage)
		(*invalidatepage)(page, offset);
}

static inline void truncate_partial_page(struct page *page, unsigned partial)
{
	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
	cleancache_invalidate_page(page->mapping, page);
	if (page_has_private(page))
		do_invalidatepage(page, partial);
}

/*
 * This cancels just the dirty bit on the kernel page itself, it
 * does NOT actually remove dirty bits on any mmap's that may be
 * around. It also leaves the page tagged dirty, so any sync
 * activity will still find it on the dirty lists, and in particular,
 * clear_page_dirty_for_io() will still look at the dirty bits in
 * the VM.
 *
 * Doing this should *normally* only ever be done when a page
 * is truncated, and is not actually mapped anywhere at all. However,
 * fs/buffer.c does this when it notices that somebody has cleaned
 * out all the buffers on a page without actually doing it through
 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
 */
void cancel_dirty_page(struct page *page, unsigned int account_size)
{
	if (TestClearPageDirty(page)) {
		struct address_space *mapping = page->mapping;
		if (mapping && mapping_cap_account_dirty(mapping)) {
			dec_zone_page_state(page, NR_FILE_DIRTY);
			dec_bdi_stat(mapping->backing_dev_info,
					BDI_RECLAIMABLE);
			if (account_size)
				task_io_account_cancelled_write(account_size);
		}
	}
}
EXPORT_SYMBOL(cancel_dirty_page);
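
/*
 * Illustrative sketch (not built, guarded by #if 0): do_invalidatepage()
 * dispatches to the mapping's ->invalidatepage method and only falls back
 * to block_invalidatepage() under CONFIG_BLOCK when the method is NULL.
 * A block-backed filesystem that is happy with the generic buffer_head
 * behaviour can simply point the method at block_invalidatepage(); the
 * aops name below is hypothetical and the other methods are omitted.
 */
#if 0
static const struct address_space_operations example_aops = {
	.invalidatepage	= block_invalidatepage,
	/* .readpage, .writepage, etc. omitted from this sketch */
};
#endif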

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bale out if page->mapping is no longer equal to the original
 * mapping.  This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static int
truncate_complete_page(struct address_space *mapping, struct page *page)
{
	if (page->mapping != mapping)
		return -EIO;

	if (page_has_private(page))
		do_invalidatepage(page, 0);

	cancel_dirty_page(page, PAGE_CACHE_SIZE);

	clear_page_mlock(page);
	ClearPageMappedToDisk(page);
	delete_from_page_cache(page);
	return 0;
}

/*
 * This is for invalidate_mapping_pages().  That function can be called at
 * any time, and is not supposed to throw away dirty pages.  But pages can
 * be marked dirty at any time too, so use remove_mapping which safely
 * discards clean, unused pages.
 *
 * Returns non-zero if the page was successfully invalidated.
 */
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
	int ret;

	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, 0))
		return 0;

	clear_page_mlock(page);
	ret = remove_mapping(mapping, page);

	return ret;
}

int truncate_inode_page(struct address_space *mapping, struct page *page)
{
	if (page_mapped(page)) {
		unmap_mapping_range(mapping,
				   (loff_t)page->index << PAGE_CACHE_SHIFT,
				   PAGE_CACHE_SIZE, 0);
	}
	return truncate_complete_page(mapping, page);
}

/*
 * Used to get rid of pages on hardware memory corruption.
 */
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
	if (!mapping)
		return -EINVAL;
	/*
	 * Only punch for normal data pages for now.
	 * Handling other types like directories would need more auditing.
	 */
	if (!S_ISREG(mapping->host->i_mode))
		return -EIO;
	return truncate_inode_page(mapping, page);
}
EXPORT_SYMBOL(generic_error_remove_page);

/*
 * Safely invalidate one page from its pagecache mapping.
 * It only drops clean, unused pages. The page must be locked.
 *
 * Returns 1 if the page is successfully invalidated, otherwise 0.
 */
int invalidate_inode_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	if (!mapping)
		return 0;
	if (PageDirty(page) || PageWriteback(page))
		return 0;
	if (page_mapped(page))
		return 0;
	return invalidate_complete_page(mapping, page);
}
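
/*
 * Illustrative sketch (not built, guarded by #if 0): invalidate_inode_page()
 * requires the page to be locked, so a caller that must not block would wrap
 * it in a trylock, exactly as the loop in invalidate_mapping_pages() below
 * does.  The helper name is hypothetical.
 */
#if 0
static int example_try_to_drop_page(struct page *page)
{
	int ret;

	if (!trylock_page(page))
		return 0;
	ret = invalidate_inode_page(page);
	unlock_page(page);
	return ret;
}
#endif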

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate
 *
 * Truncate the page cache, removing the pages that are between
 * the specified offsets (and zeroing out the partial page
 * if lstart is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 */
void truncate_inode_pages_range(struct address_space *mapping,
				loff_t lstart, loff_t lend)
{
	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t index;
	pgoff_t end;
	int i;

	cleancache_invalidate_inode(mapping);
	if (mapping->nrpages == 0)
		return;

	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
	end = (lend >> PAGE_CACHE_SHIFT);

	pagevec_init(&pvec, 0);
	index = start;
	while (index <= end && pagevec_lookup(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = page->index;
			if (index > end)
				break;

			if (!trylock_page(page))
				continue;
			WARN_ON(page->index != index);
			if (PageWriteback(page)) {
				unlock_page(page);
				continue;
			}
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
		index++;
	}

	if (partial) {
		struct page *page = find_lock_page(mapping, start - 1);
		if (page) {
			wait_on_page_writeback(page);
			truncate_partial_page(page, partial);
			unlock_page(page);
			page_cache_release(page);
		}
	}

	index = start;
	for ( ; ; ) {
		cond_resched();
		if (!pagevec_lookup(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
			if (index == start)
				break;
			index = start;
			continue;
		}
		if (index == start && pvec.pages[0]->index > end) {
			pagevec_release(&pvec);
			break;
		}
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = page->index;
			if (index > end)
				break;

			lock_page(page);
			WARN_ON(page->index != index);
			wait_on_page_writeback(page);
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		index++;
	}
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
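
/*
 * Illustrative sketch (not built, guarded by #if 0): because of the BUG_ON
 * above, lend must end exactly one byte short of a page boundary (or be -1).
 * A caller removing nr whole pages starting at page index idx would therefore
 * convert to byte offsets like this; the helper name is hypothetical.
 */
#if 0
static void example_truncate_whole_pages(struct address_space *mapping,
					 pgoff_t idx, pgoff_t nr)
{
	loff_t lstart = (loff_t)idx << PAGE_CACHE_SHIFT;
	loff_t lend = ((loff_t)(idx + nr) << PAGE_CACHE_SHIFT) - 1;

	truncate_inode_pages_range(mapping, lstart, lend);
}
#endif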

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_mutex.
 *
 * Note: When this function returns, there can be a page in the process of
 * deletion (inside __delete_from_page_cache()) in the specified range.  Thus
 * mapping->nrpages can be non-zero when this function returns even after
 * truncation of the whole mapping.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);

/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity. It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t index = start;
	unsigned long ret;
	unsigned long count = 0;
	int i;

	/*
	 * Note: this function may get called on a shmem/tmpfs mapping:
	 * pagevec_lookup() might then return 0 prematurely (because it
	 * got a gangful of swap entries); but it's hardly worth worrying
	 * about - it can rarely have anything to free from such a mapping
	 * (most pages are dirty), and already skips over any difficulties.
	 */

	pagevec_init(&pvec, 0);
	while (index <= end && pagevec_lookup(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = page->index;
			if (index > end)
				break;

			if (!trylock_page(page))
				continue;
			WARN_ON(page->index != index);
			ret = invalidate_inode_page(page);
			unlock_page(page);
			/*
			 * Invalidation is a hint that the page is no longer
			 * of interest; try to speed up its reclaim.
			 */
			if (!ret)
				deactivate_page(page);
			count += ret;
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
		index++;
	}
	return count;
}
EXPORT_SYMBOL(invalidate_mapping_pages);
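
/*
 * Illustrative sketch (not built, guarded by #if 0): dropping clean, unmapped
 * cache pages over a byte range, in the style of POSIX_FADV_DONTNEED
 * handling.  The helper name and the rounding choice (only pages wholly
 * inside the range are touched) are assumptions of this sketch.
 */
#if 0
static unsigned long example_drop_clean_pages(struct address_space *mapping,
					      loff_t offset, loff_t len)
{
	pgoff_t first = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_CACHE_SHIFT;	/* exclusive */

	if (end <= first)
		return 0;
	return invalidate_mapping_pages(mapping, first, end - 1);
}
#endif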

/*
 * This is like invalidate_complete_page(), except it ignores the page's
 * refcount.  We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave pages behind because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the lru_cache_add() pagevecs.
 */
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
		return 0;

	spin_lock_irq(&mapping->tree_lock);
	if (PageDirty(page))
		goto failed;

	clear_page_mlock(page);
	BUG_ON(page_has_private(page));
	__delete_from_page_cache(page);
	spin_unlock_irq(&mapping->tree_lock);
	mem_cgroup_uncharge_cache_page(page);

	if (mapping->a_ops->freepage)
		mapping->a_ops->freepage(page);

	page_cache_release(page);	/* pagecache ref */
	return 1;
failed:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}

static int do_launder_page(struct address_space *mapping, struct page *page)
{
	if (!PageDirty(page))
		return 0;
	if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
		return 0;
	return mapping->a_ops->launder_page(page);
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
				  pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t index;
	int i;
	int ret = 0;
	int ret2 = 0;
	int did_range_unmap = 0;

	cleancache_invalidate_inode(mapping);
	pagevec_init(&pvec, 0);
	index = start;
	while (index <= end && pagevec_lookup(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = page->index;
			if (index > end)
				break;

			lock_page(page);
			WARN_ON(page->index != index);
			if (page->mapping != mapping) {
				unlock_page(page);
				continue;
			}
			wait_on_page_writeback(page);
			if (page_mapped(page)) {
				if (!did_range_unmap) {
					/*
					 * Zap the rest of the file in one hit.
					 */
					unmap_mapping_range(mapping,
					   (loff_t)index << PAGE_CACHE_SHIFT,
					   (loff_t)(1 + end - index)
							<< PAGE_CACHE_SHIFT,
					    0);
					did_range_unmap = 1;
				} else {
					/*
					 * Just zap this page
					 */
					unmap_mapping_range(mapping,
					   (loff_t)index << PAGE_CACHE_SHIFT,
					   PAGE_CACHE_SIZE, 0);
				}
			}
			BUG_ON(page_mapped(page));
			ret2 = do_launder_page(mapping, page);
			if (ret2 == 0) {
				if (!invalidate_complete_page2(mapping, page))
					ret2 = -EBUSY;
			}
			if (ret2 < 0)
				ret = ret2;
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
		index++;
	}
	cleancache_invalidate_inode(mapping);
	return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
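
/*
 * Illustrative sketch (not built, guarded by #if 0): after a direct-I/O
 * write, any cached pages covering the written region are stale and are
 * typically kicked out with invalidate_inode_pages2_range(), in the style
 * of generic_file_direct_write().  The helper name is hypothetical.
 */
#if 0
static int example_invalidate_after_dio(struct address_space *mapping,
					loff_t pos, size_t count)
{
	pgoff_t first = pos >> PAGE_CACHE_SHIFT;
	pgoff_t last = (pos + count - 1) >> PAGE_CACHE_SHIFT;

	if (!count)
		return 0;
	/* returns -EBUSY if some page could not be invalidated */
	return invalidate_inode_pages2_range(mapping, first, last);
}
#endif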

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
	return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @oldsize: old file size
 * @newsize: new file size
 *
 * inode's new i_size must already be written before truncate_pagecache
 * is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);

	/*
	 * unmap_mapping_range is called twice, first simply for
	 * efficiency so that truncate_inode_pages does fewer
	 * single-page unmaps.  However after this first call, and
	 * before truncate_inode_pages finishes, it is possible for
	 * private pages to be COWed, which remain after
	 * truncate_inode_pages finishes, hence the second
	 * unmap_mapping_range call must be made for correctness.
	 */
	unmap_mapping_range(mapping, holebegin, 0, 1);
	truncate_inode_pages(mapping, newsize);
	unmap_mapping_range(mapping, holebegin, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * truncate_setsize updates i_size and performs pagecache truncation (if
 * necessary) to @newsize.  It will typically be called from the filesystem's
 * setattr function when ATTR_SIZE is passed in.
 *
 * Must be called with inode->i_mutex held and before all filesystem specific
 * block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize;

	oldsize = inode->i_size;
	i_size_write(inode, newsize);

	truncate_pagecache(inode, oldsize, newsize);
}
EXPORT_SYMBOL(truncate_setsize);

/**
 * vmtruncate - unmap mappings "freed" by truncate() syscall
 * @inode: inode of the file used
 * @newsize: file offset to start truncating
 *
 * This function is deprecated and truncate_setsize or truncate_pagecache
 * should be used instead, together with filesystem specific block truncation.
 */
int vmtruncate(struct inode *inode, loff_t newsize)
{
	int error;

	error = inode_newsize_ok(inode, newsize);
	if (error)
		return error;

	truncate_setsize(inode, newsize);
	if (inode->i_op->truncate)
		inode->i_op->truncate(inode);
	return 0;
}
EXPORT_SYMBOL(vmtruncate);
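
/*
 * Illustrative sketch (not built, guarded by #if 0): the usual shape of a
 * filesystem ->setattr handling ATTR_SIZE with truncate_setsize(), per the
 * kerneldoc above.  example_truncate_blocks() stands in for the fs-specific
 * on-disk block truncation and is hypothetical, as is the function name.
 */
#if 0
static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != inode->i_size) {
		/* update i_size and drop the truncated pagecache first... */
		truncate_setsize(inode, attr->ia_size);
		/* ...then free the on-disk blocks beyond the new size */
		example_truncate_blocks(inode, attr->ia_size);
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}
#endif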

int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(lstart, PAGE_SIZE);
	loff_t holelen = 1 + lend - holebegin;

	/*
	 * If the underlying filesystem is not going to provide
	 * a way to truncate a range of blocks (punch a hole) -
	 * we should return failure right now.
	 */
	if (!inode->i_op->truncate_range)
		return -ENOSYS;

	mutex_lock(&inode->i_mutex);
	inode_dio_wait(inode);
	unmap_mapping_range(mapping, holebegin, holelen, 1);
	inode->i_op->truncate_range(inode, lstart, lend);
	/* unmap again to remove racily COWed private pages */
	unmap_mapping_range(mapping, holebegin, holelen, 1);
	mutex_unlock(&inode->i_mutex);

	return 0;
}

/**
 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
 * @inode: inode
 * @lstart: offset of beginning of hole
 * @lend: offset of last byte of hole
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t unmap_start = round_up(lstart, PAGE_SIZE);
	loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
	/*
	 * This rounding is currently just for example: unmap_mapping_range
	 * expands its hole outwards, whereas we want it to contract the hole
	 * inwards.  However, existing callers of truncate_pagecache_range are
	 * doing their own page rounding first; and truncate_inode_pages_range
	 * currently BUGs if lend is not pagealigned-1 (it handles partial
	 * page at start of hole, but not partial page at end of hole).  Note
	 * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
	 */

	/*
	 * Unlike in truncate_pagecache, unmap_mapping_range is called only
	 * once (before truncating pagecache), and without "even_cows" flag:
	 * hole-punching should not remove private COWed pages from the hole.
	 */
	if ((u64)unmap_end > (u64)unmap_start)
		unmap_mapping_range(mapping, unmap_start,
				    1 + unmap_end - unmap_start, 0);
	truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL(truncate_pagecache_range);
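
/*
 * Illustrative sketch (not built, guarded by #if 0): a filesystem hole-punch
 * path would normally drop the affected pagecache with
 * truncate_pagecache_range() before freeing the on-disk blocks.  Both helper
 * names are hypothetical; lend is the offset of the last byte in the hole.
 */
#if 0
static int example_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	loff_t lstart = offset;
	loff_t lend = offset + len - 1;

	truncate_pagecache_range(inode, lstart, lend);
	return example_free_blocks(inode, lstart, lend);
}
#endif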