/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   do_invalidatepage */
#include <linux/cleancache.h>
#include "internal.h"

/**
 * do_invalidatepage - invalidate part or all of a page
 * @page: the page which is affected
 * @offset: the byte offset within the page of the truncation point
 *
 * do_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * do_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point, because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void do_invalidatepage(struct page *page, unsigned long offset)
{
        void (*invalidatepage)(struct page *, unsigned long);

        invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
        if (!invalidatepage)
                invalidatepage = block_invalidatepage;
#endif
        if (invalidatepage)
                (*invalidatepage)(page, offset);
}
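
/*
 * Illustrative sketch, not part of this file: a block-backed filesystem
 * normally either leaves ->invalidatepage NULL, so that do_invalidatepage()
 * above falls back to block_invalidatepage(), or wires the callback up
 * explicitly in its address_space_operations.  Assuming a hypothetical
 * "foo" filesystem, that looks roughly like:
 *
 *	static const struct address_space_operations foo_aops = {
 *		.readpage	= foo_readpage,
 *		.writepage	= foo_writepage,
 *		.invalidatepage	= block_invalidatepage,
 *	};
 */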

static inline void truncate_partial_page(struct page *page, unsigned partial)
{
        zero_user_segment(page, partial, PAGE_CACHE_SIZE);
        cleancache_flush_page(page->mapping, page);
        if (page_has_private(page))
                do_invalidatepage(page, partial);
}

/*
 * This cancels just the dirty bit on the kernel page itself; it
 * does NOT actually remove dirty bits on any mmap's that may be
 * around. It also leaves the page tagged dirty, so any sync
 * activity will still find it on the dirty lists, and in particular,
 * clear_page_dirty_for_io() will still look at the dirty bits in
 * the VM.
 *
 * Doing this should *normally* only ever be done when a page
 * is truncated, and is not actually mapped anywhere at all. However,
 * fs/buffer.c does this when it notices that somebody has cleaned
 * out all the buffers on a page without actually doing it through
 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
 */
void cancel_dirty_page(struct page *page, unsigned int account_size)
{
        if (TestClearPageDirty(page)) {
                struct address_space *mapping = page->mapping;
                if (mapping && mapping_cap_account_dirty(mapping)) {
                        dec_zone_page_state(page, NR_FILE_DIRTY);
                        dec_bdi_stat(mapping->backing_dev_info,
                                        BDI_RECLAIMABLE);
                        if (account_size)
                                task_io_account_cancelled_write(account_size);
                }
        }
}
EXPORT_SYMBOL(cancel_dirty_page);

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping.  This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first,
 * and c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static int
truncate_complete_page(struct address_space *mapping, struct page *page)
{
        if (page->mapping != mapping)
                return -EIO;

        if (page_has_private(page))
                do_invalidatepage(page, 0);

        cancel_dirty_page(page, PAGE_CACHE_SIZE);

        clear_page_mlock(page);
        ClearPageMappedToDisk(page);
        delete_from_page_cache(page);
        return 0;
}

/*
 * This is for invalidate_mapping_pages().  That function can be called at
 * any time, and is not supposed to throw away dirty pages.  But pages can
 * be marked dirty at any time too, so use remove_mapping which safely
 * discards clean, unused pages.
 *
 * Returns non-zero if the page was successfully invalidated.
 */
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
        int ret;

        if (page->mapping != mapping)
                return 0;

        if (page_has_private(page) && !try_to_release_page(page, 0))
                return 0;

        clear_page_mlock(page);
        ret = remove_mapping(mapping, page);

        return ret;
}

int truncate_inode_page(struct address_space *mapping, struct page *page)
{
        if (page_mapped(page)) {
                unmap_mapping_range(mapping,
                                   (loff_t)page->index << PAGE_CACHE_SHIFT,
                                   PAGE_CACHE_SIZE, 0);
        }
        return truncate_complete_page(mapping, page);
}

/*
 * Used to get rid of pages on hardware memory corruption.
 */
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
        if (!mapping)
                return -EINVAL;
        /*
         * Only punch for normal data pages for now.
         * Handling other types like directories would need more auditing.
         */
        if (!S_ISREG(mapping->host->i_mode))
                return -EIO;
        return truncate_inode_page(mapping, page);
}
EXPORT_SYMBOL(generic_error_remove_page);

/*
 * Safely invalidate one page from its pagecache mapping.
 * It only drops clean, unused pages.  The page must be locked.
 *
 * Returns 1 if the page is successfully invalidated, otherwise 0.
 */
int invalidate_inode_page(struct page *page)
{
        struct address_space *mapping = page_mapping(page);
        if (!mapping)
                return 0;
        if (PageDirty(page) || PageWriteback(page))
                return 0;
        if (page_mapped(page))
                return 0;
        return invalidate_complete_page(mapping, page);
}
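
/*
 * Illustrative sketch, not part of this file: a caller that merely wants to
 * hint a clean, unused page out of the pagecache (say, a hypothetical
 * foo_drop_page() helper holding a reference on the page) is expected to
 * take the page lock itself, roughly:
 *
 *	static void foo_drop_page(struct page *page)
 *	{
 *		if (!trylock_page(page))
 *			return;
 *		invalidate_inode_page(page);
 *		unlock_page(page);
 *	}
 *
 * Dirty, mapped or writeback pages are simply left alone, which is the
 * whole point of this interface.
 */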

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out partial page
 * (if lstart is not page aligned)).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to avoid doing IO against the affected region wherever
 * possible.  The first pass will remove most pages, so the search cost of
 * the second pass is low.
 *
 * When looking at page->index outside the page lock we need to be careful to
 * copy it into a local to avoid races (it could change at any time).
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 */
void truncate_inode_pages_range(struct address_space *mapping,
                                loff_t lstart, loff_t lend)
{
        const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
        pgoff_t end;
        const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
        struct pagevec pvec;
        pgoff_t next;
        int i;

        cleancache_flush_inode(mapping);
        if (mapping->nrpages == 0)
                return;

        BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
        end = (lend >> PAGE_CACHE_SHIFT);

        pagevec_init(&pvec, 0);
        next = start;
        while (next <= end &&
               pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
                        pgoff_t page_index = page->index;

                        if (page_index > end) {
                                next = page_index;
                                break;
                        }

                        if (page_index > next)
                                next = page_index;
                        next++;
                        if (!trylock_page(page))
                                continue;
                        if (PageWriteback(page)) {
                                unlock_page(page);
                                continue;
                        }
                        truncate_inode_page(mapping, page);
                        unlock_page(page);
                }
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                cond_resched();
        }

        if (partial) {
                struct page *page = find_lock_page(mapping, start - 1);
                if (page) {
                        wait_on_page_writeback(page);
                        truncate_partial_page(page, partial);
                        unlock_page(page);
                        page_cache_release(page);
                }
        }

        next = start;
        for ( ; ; ) {
                cond_resched();
                if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
                        if (next == start)
                                break;
                        next = start;
                        continue;
                }
                if (pvec.pages[0]->index > end) {
                        pagevec_release(&pvec);
                        break;
                }
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        if (page->index > end)
                                break;
                        lock_page(page);
                        wait_on_page_writeback(page);
                        truncate_inode_page(mapping, page);
                        if (page->index > next)
                                next = page->index;
                        next++;
                        unlock_page(page);
                }
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
        }
        cleancache_flush_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_mutex.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
        truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);
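
/*
 * Illustrative sketch, not part of this file: the classic caller of
 * truncate_inode_pages() is a filesystem's inode eviction path, which throws
 * away every cached page before releasing the inode.  With a hypothetical
 * "foo" filesystem (end_writeback() being the usual follow-up at this kernel
 * vintage), that looks roughly like:
 *
 *	static void foo_evict_inode(struct inode *inode)
 *	{
 *		truncate_inode_pages(&inode->i_data, 0);
 *		end_writeback(inode);
 *	}
 */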

/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity.  It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t end)
{
        struct pagevec pvec;
        pgoff_t next = start;
        unsigned long ret;
        unsigned long count = 0;
        int i;

        pagevec_init(&pvec, 0);
        while (next <= end &&
                        pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
                        pgoff_t index;
                        int lock_failed;

                        lock_failed = !trylock_page(page);

                        /*
                         * We really shouldn't be looking at the ->index of an
                         * unlocked page.  But we're not allowed to lock these
                         * pages.  So we rely upon nobody altering the ->index
                         * of this (pinned-by-us) page.
                         */
                        index = page->index;
                        if (index > next)
                                next = index;
                        next++;
                        if (lock_failed)
                                continue;

                        ret = invalidate_inode_page(page);
                        unlock_page(page);
                        /*
                         * Invalidation is a hint that the page is no longer
                         * of interest, so try to speed up its reclaim.
                         */
                        if (!ret)
                                deactivate_page(page);
                        count += ret;
                        if (next > end)
                                break;
                }
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                cond_resched();
        }
        return count;
}
EXPORT_SYMBOL(invalidate_mapping_pages);
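
/*
 * Illustrative sketch, not part of this file: invalidate_mapping_pages() is
 * the best-effort interface used by paths such as the fadvise(2)
 * POSIX_FADV_DONTNEED handling, which converts byte offsets to page indexes
 * and simply asks for the range to be dropped, roughly:
 *
 *	start = offset >> PAGE_CACHE_SHIFT;
 *	end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
 *	invalidate_mapping_pages(mapping, start, end);
 *
 * Whatever cannot be invalidated (dirty, locked, mapped or writeback pages)
 * is silently skipped.
 */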

/*
 * This is like invalidate_complete_page(), except it ignores the page's
 * refcount.  We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave pages behind because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the lru_cache_add() pagevecs.
 */
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
        if (page->mapping != mapping)
                return 0;

        if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
                return 0;

        spin_lock_irq(&mapping->tree_lock);
        if (PageDirty(page))
                goto failed;

        clear_page_mlock(page);
        BUG_ON(page_has_private(page));
        __delete_from_page_cache(page);
        spin_unlock_irq(&mapping->tree_lock);
        mem_cgroup_uncharge_cache_page(page);

        if (mapping->a_ops->freepage)
                mapping->a_ops->freepage(page);

        page_cache_release(page);       /* pagecache ref */
        return 1;
failed:
        spin_unlock_irq(&mapping->tree_lock);
        return 0;
}

static int do_launder_page(struct address_space *mapping, struct page *page)
{
        if (!PageDirty(page))
                return 0;
        if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
                return 0;
        return mapping->a_ops->launder_page(page);
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
                                  pgoff_t start, pgoff_t end)
{
        struct pagevec pvec;
        pgoff_t next;
        int i;
        int ret = 0;
        int ret2 = 0;
        int did_range_unmap = 0;
        int wrapped = 0;

        cleancache_flush_inode(mapping);
        pagevec_init(&pvec, 0);
        next = start;
        while (next <= end && !wrapped &&
                pagevec_lookup(&pvec, mapping, next,
                        min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
                        pgoff_t page_index;

                        lock_page(page);
                        if (page->mapping != mapping) {
                                unlock_page(page);
                                continue;
                        }
                        page_index = page->index;
                        next = page_index + 1;
                        if (next == 0)
                                wrapped = 1;
                        if (page_index > end) {
                                unlock_page(page);
                                break;
                        }
                        wait_on_page_writeback(page);
                        if (page_mapped(page)) {
                                if (!did_range_unmap) {
                                        /*
                                         * Zap the rest of the file in one hit.
                                         */
                                        unmap_mapping_range(mapping,
                                           (loff_t)page_index<<PAGE_CACHE_SHIFT,
                                           (loff_t)(end - page_index + 1)
                                                        << PAGE_CACHE_SHIFT,
                                            0);
                                        did_range_unmap = 1;
                                } else {
                                        /*
                                         * Just zap this page
                                         */
                                        unmap_mapping_range(mapping,
                                          (loff_t)page_index<<PAGE_CACHE_SHIFT,
                                          PAGE_CACHE_SIZE, 0);
                                }
                        }
                        BUG_ON(page_mapped(page));
                        ret2 = do_launder_page(mapping, page);
                        if (ret2 == 0) {
                                if (!invalidate_complete_page2(mapping, page))
                                        ret2 = -EBUSY;
                        }
                        if (ret2 < 0)
                                ret = ret2;
                        unlock_page(page);
                }
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                cond_resched();
        }
        cleancache_flush_inode(mapping);
        return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
        return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
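
/*
 * Illustrative sketch, not part of this file: invalidate_inode_pages2_range()
 * is the stronger interface relied upon by direct-IO style writers to make
 * sure stale pagecache does not shadow data written around the cache.  A
 * write covering [pos, pos + len) would do something along the lines of:
 *
 *	err = invalidate_inode_pages2_range(mapping,
 *			pos >> PAGE_CACHE_SHIFT,
 *			(pos + len - 1) >> PAGE_CACHE_SHIFT);
 *
 * and propagate a failure (typically -EBUSY) back to its caller rather than
 * silently keeping the stale cache, much as the generic direct-IO write path
 * does.
 */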

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @old: old file size
 * @new: new file size
 *
 * inode's new i_size must already be written before truncate_pagecache
 * is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks).  This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
{
        struct address_space *mapping = inode->i_mapping;

        /*
         * unmap_mapping_range is called twice, first simply for
         * efficiency so that truncate_inode_pages does fewer
         * single-page unmaps.  However after this first call, and
         * before truncate_inode_pages finishes, it is possible for
         * private pages to be COWed, which remain after
         * truncate_inode_pages finishes, hence the second
         * unmap_mapping_range call must be made for correctness.
         */
        unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
        truncate_inode_pages(mapping, new);
        unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * truncate_setsize updates i_size and performs pagecache truncation (if
 * necessary) to @newsize.  It will typically be called from the filesystem's
 * setattr function when ATTR_SIZE is passed in.
 *
 * Must be called with inode->i_mutex held and before all filesystem specific
 * block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
        loff_t oldsize;

        oldsize = inode->i_size;
        i_size_write(inode, newsize);

        truncate_pagecache(inode, oldsize, newsize);
}
EXPORT_SYMBOL(truncate_setsize);

/**
 * vmtruncate - unmap mappings "freed" by truncate() syscall
 * @inode: inode of the file used
 * @offset: file offset to start truncating
 *
 * This function is deprecated and truncate_setsize or truncate_pagecache
 * should be used instead, together with filesystem specific block truncation.
 */
int vmtruncate(struct inode *inode, loff_t offset)
{
        int error;

        error = inode_newsize_ok(inode, offset);
        if (error)
                return error;

        truncate_setsize(inode, offset);
        if (inode->i_op->truncate)
                inode->i_op->truncate(inode);
        return 0;
}
EXPORT_SYMBOL(vmtruncate);
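
/*
 * Illustrative sketch, not part of this file: a filesystem that has moved off
 * the deprecated vmtruncate() typically does the equivalent work in its
 * ->setattr(), calling truncate_setsize() and then its own block truncation.
 * With a hypothetical foo_truncate_blocks(), the size-changing part of such
 * a setattr looks roughly like:
 *
 *	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != inode->i_size) {
 *		truncate_setsize(inode, attr->ia_size);
 *		foo_truncate_blocks(inode, attr->ia_size);
 *	}
 *
 * inode_newsize_ok() and the usual setattr_copy()/mark_inode_dirty() handling
 * are assumed to happen around this snippet.
 */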