/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   do_invalidatepage */
#include <linux/cleancache.h>
#include "internal.h"


/**
 * do_invalidatepage - invalidate part or all of a page
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * do_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * do_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point, because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void do_invalidatepage(struct page *page, unsigned long offset)
{
	void (*invalidatepage)(struct page *, unsigned long);

	invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
	if (!invalidatepage)
		invalidatepage = block_invalidatepage;
#endif
	if (invalidatepage)
		(*invalidatepage)(page, offset);
}

static inline void truncate_partial_page(struct page *page, unsigned partial)
{
	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
	cleancache_flush_page(page->mapping, page);
	if (page_has_private(page))
		do_invalidatepage(page, partial);
}

/*
 * This cancels just the dirty bit on the kernel page itself, it
 * does NOT actually remove dirty bits on any mmap's that may be
 * around. It also leaves the page tagged dirty, so any sync
 * activity will still find it on the dirty lists, and in particular,
 * clear_page_dirty_for_io() will still look at the dirty bits in
 * the VM.
 *
 * Doing this should *normally* only ever be done when a page
 * is truncated, and is not actually mapped anywhere at all. However,
 * fs/buffer.c does this when it notices that somebody has cleaned
 * out all the buffers on a page without actually doing it through
 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
 */
void cancel_dirty_page(struct page *page, unsigned int account_size)
{
	if (TestClearPageDirty(page)) {
		struct address_space *mapping = page->mapping;
		if (mapping && mapping_cap_account_dirty(mapping)) {
			dec_zone_page_state(page, NR_FILE_DIRTY);
			dec_bdi_stat(mapping->backing_dev_info,
					BDI_RECLAIMABLE);
			if (account_size)
				task_io_account_cancelled_write(account_size);
		}
	}
}
EXPORT_SYMBOL(cancel_dirty_page);
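/*
 * Illustrative sketch (not from the original file): how a filesystem
 * would supply the ->invalidatepage hook that do_invalidatepage() above
 * dispatches to. myfs_invalidatepage, myfs_drop_page_state and myfs_aops
 * are hypothetical names; the #if 0 block is a non-compiled example.
 */
#if 0
static void myfs_invalidatepage(struct page *page, unsigned long offset)
{
	/* Drop fs-private state attached to the truncated part of @page. */
	if (offset == 0)
		myfs_drop_page_state(page);	/* hypothetical helper */
	/* Let the generic buffer-head code handle the buffers. */
	block_invalidatepage(page, offset);
}

static const struct address_space_operations myfs_aops = {
	.invalidatepage	= myfs_invalidatepage,
	/* ... other methods ... */
};
#endif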
/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned. It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping. This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first,
 * and c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static int
truncate_complete_page(struct address_space *mapping, struct page *page)
{
	if (page->mapping != mapping)
		return -EIO;

	if (page_has_private(page))
		do_invalidatepage(page, 0);

	cancel_dirty_page(page, PAGE_CACHE_SIZE);

	clear_page_mlock(page);
	ClearPageMappedToDisk(page);
	delete_from_page_cache(page);
	return 0;
}

/*
 * This is for invalidate_mapping_pages(). That function can be called at
 * any time, and is not supposed to throw away dirty pages. But pages can
 * be marked dirty at any time too, so use remove_mapping which safely
 * discards clean, unused pages.
 *
 * Returns non-zero if the page was successfully invalidated.
 */
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
	int ret;

	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, 0))
		return 0;

	clear_page_mlock(page);
	ret = remove_mapping(mapping, page);

	return ret;
}

int truncate_inode_page(struct address_space *mapping, struct page *page)
{
	if (page_mapped(page)) {
		unmap_mapping_range(mapping,
				   (loff_t)page->index << PAGE_CACHE_SHIFT,
				   PAGE_CACHE_SIZE, 0);
	}
	return truncate_complete_page(mapping, page);
}

/*
 * Used to get rid of pages on hardware memory corruption.
 */
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
	if (!mapping)
		return -EINVAL;
	/*
	 * Only punch for normal data pages for now.
	 * Handling other types like directories would need more auditing.
	 */
	if (!S_ISREG(mapping->host->i_mode))
		return -EIO;
	return truncate_inode_page(mapping, page);
}
EXPORT_SYMBOL(generic_error_remove_page);

/*
 * Safely invalidate one page from its pagecache mapping.
 * It only drops clean, unused pages. The page must be locked.
 *
 * Returns 1 if the page is successfully invalidated, otherwise 0.
 */
int invalidate_inode_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	if (!mapping)
		return 0;
	if (PageDirty(page) || PageWriteback(page))
		return 0;
	if (page_mapped(page))
		return 0;
	return invalidate_complete_page(mapping, page);
}
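/*
 * Illustrative sketch (not from the original file): the calling contract
 * for invalidate_inode_page() - the caller must hold a reference and take
 * the page lock. try_drop_page is a hypothetical helper modelled on the
 * invalidate_mapping_pages() loop below.
 */
#if 0
static int try_drop_page(struct page *page)
{
	int ret = 0;

	if (trylock_page(page)) {
		ret = invalidate_inode_page(page);	/* 1 on success */
		unlock_page(page);
	}
	return ret;
}
#endif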
/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate
 *
 * Truncate the page cache, removing the pages that are between
 * the specified offsets (and zeroing out the partial page
 * if lstart is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking. It will not
 * block on page locks and it will not block on writeback. The second pass
 * will wait. This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * When looking at page->index outside the page lock we need to be careful to
 * copy it into a local to avoid races (it could change at any time).
 *
 * We pass down the cache-hot hint to the page freeing code. Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 */
void truncate_inode_pages_range(struct address_space *mapping,
				loff_t lstart, loff_t lend)
{
	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	pgoff_t end;
	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t next;
	int i;

	cleancache_flush_inode(mapping);
	if (mapping->nrpages == 0)
		return;

	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
	end = (lend >> PAGE_CACHE_SHIFT);

	pagevec_init(&pvec, 0);
	next = start;
	while (next <= end &&
	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t page_index = page->index;

			if (page_index > end) {
				next = page_index;
				break;
			}

			if (page_index > next)
				next = page_index;
			next++;
			if (!trylock_page(page))
				continue;
			if (PageWriteback(page)) {
				unlock_page(page);
				continue;
			}
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
	}

	if (partial) {
		struct page *page = find_lock_page(mapping, start - 1);
		if (page) {
			wait_on_page_writeback(page);
			truncate_partial_page(page, partial);
			unlock_page(page);
			page_cache_release(page);
		}
	}

	next = start;
	for ( ; ; ) {
		cond_resched();
		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
			if (next == start)
				break;
			next = start;
			continue;
		}
		if (pvec.pages[0]->index > end) {
			pagevec_release(&pvec);
			break;
		}
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			if (page->index > end)
				break;
			lock_page(page);
			wait_on_page_writeback(page);
			truncate_inode_page(mapping, page);
			if (page->index > next)
				next = page->index;
			next++;
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
	}
	cleancache_flush_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_mutex.
 *
 * Note: When this function returns, there can be a page in the process of
 * deletion (inside __delete_from_page_cache()) in the specified range. Thus
 * mapping->nrpages can be non-zero when this function returns even after
 * truncation of the whole mapping.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);
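/*
 * Illustrative sketch (not from the original file): the classic caller of
 * truncate_inode_pages() is a filesystem's ->evict_inode(), which must
 * empty the page cache before the inode is destroyed. myfs_evict_inode is
 * a hypothetical name.
 */
#if 0
static void myfs_evict_inode(struct inode *inode)
{
	truncate_inode_pages(&inode->i_data, 0);	/* drop all pages */
	end_writeback(inode);
	/* ... release fs-private inode state ... */
}
#endif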
/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity. It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
				       pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t next = start;
	unsigned long ret;
	unsigned long count = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (next <= end &&
	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t index;
			int lock_failed;

			lock_failed = !trylock_page(page);

			/*
			 * We really shouldn't be looking at the ->index of an
			 * unlocked page. But we're not allowed to lock these
			 * pages. So we rely upon nobody altering the ->index
			 * of this (pinned-by-us) page.
			 */
			index = page->index;
			if (index > next)
				next = index;
			next++;
			if (lock_failed)
				continue;

			ret = invalidate_inode_page(page);
			unlock_page(page);
			/*
			 * Invalidation is a hint that the page is no longer
			 * of interest, so try to speed up its reclaim.
			 */
			if (!ret)
				deactivate_page(page);
			count += ret;
			if (next > end)
				break;
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
	}
	return count;
}
EXPORT_SYMBOL(invalidate_mapping_pages);

/*
 * This is like invalidate_complete_page(), except it ignores the page's
 * refcount. We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave pages behind because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the lru_cache_add() pagevecs.
 */
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
		return 0;

	spin_lock_irq(&mapping->tree_lock);
	if (PageDirty(page))
		goto failed;

	clear_page_mlock(page);
	BUG_ON(page_has_private(page));
	__delete_from_page_cache(page);
	spin_unlock_irq(&mapping->tree_lock);
	mem_cgroup_uncharge_cache_page(page);

	if (mapping->a_ops->freepage)
		mapping->a_ops->freepage(page);

	page_cache_release(page);	/* pagecache ref */
	return 1;
failed:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}

static int do_launder_page(struct address_space *mapping, struct page *page)
{
	if (!PageDirty(page))
		return 0;
	if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
		return 0;
	return mapping->a_ops->launder_page(page);
}
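/*
 * Illustrative sketch (not from the original file): a ->launder_page()
 * implementation that do_launder_page() above would call. It is entered
 * with the page locked and dirty, and must write the page back without
 * unlocking it, since the invalidate path still needs the lock afterwards.
 * myfs_launder_page and myfs_write_page_sync are hypothetical names.
 */
#if 0
static int myfs_launder_page(struct page *page)
{
	/* Synchronously clean the page; keep it locked. */
	return myfs_write_page_sync(page);	/* 0 on success */
}
#endif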
/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
				  pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t next;
	int i;
	int ret = 0;
	int ret2 = 0;
	int did_range_unmap = 0;
	int wrapped = 0;

	cleancache_flush_inode(mapping);
	pagevec_init(&pvec, 0);
	next = start;
	while (next <= end && !wrapped &&
	       pagevec_lookup(&pvec, mapping, next,
			      min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t page_index;

			lock_page(page);
			if (page->mapping != mapping) {
				unlock_page(page);
				continue;
			}
			page_index = page->index;
			next = page_index + 1;
			if (next == 0)
				wrapped = 1;
			if (page_index > end) {
				unlock_page(page);
				break;
			}
			wait_on_page_writeback(page);
			if (page_mapped(page)) {
				if (!did_range_unmap) {
					/*
					 * Zap the rest of the file in one hit.
					 */
					unmap_mapping_range(mapping,
					    (loff_t)page_index<<PAGE_CACHE_SHIFT,
					    (loff_t)(end - page_index + 1)
							<< PAGE_CACHE_SHIFT,
					    0);
					did_range_unmap = 1;
				} else {
					/*
					 * Just zap this page
					 */
					unmap_mapping_range(mapping,
					    (loff_t)page_index<<PAGE_CACHE_SHIFT,
					    PAGE_CACHE_SIZE, 0);
				}
			}
			BUG_ON(page_mapped(page));
			ret2 = do_launder_page(mapping, page);
			if (ret2 == 0) {
				if (!invalidate_complete_page2(mapping, page))
					ret2 = -EBUSY;
			}
			if (ret2 < 0)
				ret = ret2;
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
	}
	cleancache_flush_inode(mapping);
	return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
	return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
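/*
 * Illustrative sketch (not from the original file): the direct-IO write
 * path is a typical user of invalidate_inode_pages2_range(), shooting down
 * any cached pages that overlap the write so later buffered reads do not
 * see stale data. Roughly, modelled on generic_file_direct_write():
 */
#if 0
	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, nr_segs);
	if (written > 0)
		invalidate_inode_pages2_range(mapping,
				pos >> PAGE_CACHE_SHIFT,
				(pos + written - 1) >> PAGE_CACHE_SHIFT);
#endif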
/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @old: old file offset
 * @new: new file offset
 *
 * inode's new i_size must already be written before truncate_pagecache
 * is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
{
	struct address_space *mapping = inode->i_mapping;

	/*
	 * unmap_mapping_range is called twice, first simply for
	 * efficiency so that truncate_inode_pages does fewer
	 * single-page unmaps. However after this first call, and
	 * before truncate_inode_pages finishes, it is possible for
	 * private pages to be COWed, which remain after
	 * truncate_inode_pages finishes, hence the second
	 * unmap_mapping_range call must be made for correctness.
	 */
	unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(mapping, new);
	unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * truncate_setsize updates i_size and performs pagecache truncation (if
 * necessary) to @newsize. It will typically be called from the filesystem's
 * setattr function when ATTR_SIZE is passed in.
 *
 * Must be called with inode->i_mutex held and before all filesystem specific
 * block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize;

	oldsize = inode->i_size;
	i_size_write(inode, newsize);

	truncate_pagecache(inode, oldsize, newsize);
}
EXPORT_SYMBOL(truncate_setsize);

/**
 * vmtruncate - unmap mappings "freed" by truncate() syscall
 * @inode: inode of the file used
 * @offset: file offset to start truncating
 *
 * This function is deprecated and truncate_setsize or truncate_pagecache
 * should be used instead, together with filesystem specific block truncation.
 */
int vmtruncate(struct inode *inode, loff_t offset)
{
	int error;

	error = inode_newsize_ok(inode, offset);
	if (error)
		return error;

	truncate_setsize(inode, offset);
	if (inode->i_op->truncate)
		inode->i_op->truncate(inode);
	return 0;
}
EXPORT_SYMBOL(vmtruncate);

int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
{
	struct address_space *mapping = inode->i_mapping;

	/*
	 * If the underlying filesystem is not going to provide
	 * a way to truncate a range of blocks (punch a hole) -
	 * we should return failure right now.
	 */
	if (!inode->i_op->truncate_range)
		return -ENOSYS;

	mutex_lock(&inode->i_mutex);
	down_write(&inode->i_alloc_sem);
	unmap_mapping_range(mapping, offset, (end - offset), 1);
	inode->i_op->truncate_range(inode, offset, end);
	/* unmap again to remove racily COWed private pages */
	unmap_mapping_range(mapping, offset, (end - offset), 1);
	up_write(&inode->i_alloc_sem);
	mutex_unlock(&inode->i_mutex);

	return 0;
}
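/*
 * Illustrative sketch (not from the original file): how a filesystem's
 * ->setattr() would use truncate_setsize() instead of the deprecated
 * vmtruncate(). myfs_setattr and myfs_truncate_blocks are hypothetical
 * names; inode_change_ok() and setattr_copy() are the stock helpers.
 */
#if 0
static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		truncate_setsize(inode, attr->ia_size);
		myfs_truncate_blocks(inode, attr->ia_size); /* free blocks */
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}
#endif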