1 /* 2 * linux/mm/filemap.c 3 * 4 * Copyright (C) 1994-1999 Linus Torvalds 5 */ 6 7 /* 8 * This file handles the generic file mmap semantics used by 9 * most "normal" filesystems (but you don't /have/ to use this: 10 * the NFS filesystem used to do this differently, for example) 11 */ 12 #include <linux/export.h> 13 #include <linux/compiler.h> 14 #include <linux/dax.h> 15 #include <linux/fs.h> 16 #include <linux/uaccess.h> 17 #include <linux/capability.h> 18 #include <linux/kernel_stat.h> 19 #include <linux/gfp.h> 20 #include <linux/mm.h> 21 #include <linux/swap.h> 22 #include <linux/mman.h> 23 #include <linux/pagemap.h> 24 #include <linux/file.h> 25 #include <linux/uio.h> 26 #include <linux/hash.h> 27 #include <linux/writeback.h> 28 #include <linux/backing-dev.h> 29 #include <linux/pagevec.h> 30 #include <linux/blkdev.h> 31 #include <linux/security.h> 32 #include <linux/cpuset.h> 33 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34 #include <linux/hugetlb.h> 35 #include <linux/memcontrol.h> 36 #include <linux/cleancache.h> 37 #include <linux/rmap.h> 38 #include "internal.h" 39 40 #define CREATE_TRACE_POINTS 41 #include <trace/events/filemap.h> 42 43 /* 44 * FIXME: remove all knowledge of the buffer layer from the core VM 45 */ 46 #include <linux/buffer_head.h> /* for try_to_free_buffers */ 47 48 #include <asm/mman.h> 49 50 /* 51 * Shared mappings implemented 30.11.1994. It's not fully working yet, 52 * though. 53 * 54 * Shared mappings now work. 15.8.1995 Bruno. 55 * 56 * finished 'unifying' the page and buffer cache and SMP-threaded the 57 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> 58 * 59 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> 60 */ 61 62 /* 63 * Lock ordering: 64 * 65 * ->i_mmap_rwsem (truncate_pagecache) 66 * ->private_lock (__free_pte->__set_page_dirty_buffers) 67 * ->swap_lock (exclusive_swap_page, others) 68 * ->mapping->tree_lock 69 * 70 * ->i_mutex 71 * ->i_mmap_rwsem (truncate->unmap_mapping_range) 72 * 73 * ->mmap_sem 74 * ->i_mmap_rwsem 75 * ->page_table_lock or pte_lock (various, mainly in memory.c) 76 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 77 * 78 * ->mmap_sem 79 * ->lock_page (access_process_vm) 80 * 81 * ->i_mutex (generic_perform_write) 82 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 83 * 84 * bdi->wb.list_lock 85 * sb_lock (fs/fs-writeback.c) 86 * ->mapping->tree_lock (__sync_single_inode) 87 * 88 * ->i_mmap_rwsem 89 * ->anon_vma.lock (vma_adjust) 90 * 91 * ->anon_vma.lock 92 * ->page_table_lock or pte_lock (anon_vma_prepare and various) 93 * 94 * ->page_table_lock or pte_lock 95 * ->swap_lock (try_to_unmap_one) 96 * ->private_lock (try_to_unmap_one) 97 * ->tree_lock (try_to_unmap_one) 98 * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) 99 * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) 100 * ->private_lock (page_remove_rmap->set_page_dirty) 101 * ->tree_lock (page_remove_rmap->set_page_dirty) 102 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) 103 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 104 * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) 105 * bdi.wb->list_lock (zap_pte_range->set_page_dirty) 106 * ->inode->i_lock (zap_pte_range->set_page_dirty) 107 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 108 * 109 * ->i_mmap_rwsem 110 * ->tasklist_lock (memory_failure, collect_procs_ao) 111 */ 112 113 static int page_cache_tree_insert(struct address_space *mapping, 114 struct page *page, void **shadowp) 115 { 116 struct 
radix_tree_node *node; 117 void **slot; 118 int error; 119 120 error = __radix_tree_create(&mapping->page_tree, page->index, 0, 121 &node, &slot); 122 if (error) 123 return error; 124 if (*slot) { 125 void *p; 126 127 p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 128 if (!radix_tree_exceptional_entry(p)) 129 return -EEXIST; 130 131 mapping->nrexceptional--; 132 if (!dax_mapping(mapping)) { 133 if (shadowp) 134 *shadowp = p; 135 if (node) 136 workingset_node_shadows_dec(node); 137 } else { 138 /* DAX can replace empty locked entry with a hole */ 139 WARN_ON_ONCE(p != 140 (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | 141 RADIX_DAX_ENTRY_LOCK)); 142 /* DAX accounts exceptional entries as normal pages */ 143 if (node) 144 workingset_node_pages_dec(node); 145 /* Wakeup waiters for exceptional entry lock */ 146 dax_wake_mapping_entry_waiter(mapping, page->index, 147 false); 148 } 149 } 150 radix_tree_replace_slot(slot, page); 151 mapping->nrpages++; 152 if (node) { 153 workingset_node_pages_inc(node); 154 /* 155 * Don't track node that contains actual pages. 156 * 157 * Avoid acquiring the list_lru lock if already 158 * untracked. The list_empty() test is safe as 159 * node->private_list is protected by 160 * mapping->tree_lock. 161 */ 162 if (!list_empty(&node->private_list)) 163 list_lru_del(&workingset_shadow_nodes, 164 &node->private_list); 165 } 166 return 0; 167 } 168 169 static void page_cache_tree_delete(struct address_space *mapping, 170 struct page *page, void *shadow) 171 { 172 int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); 173 174 VM_BUG_ON_PAGE(!PageLocked(page), page); 175 VM_BUG_ON_PAGE(PageTail(page), page); 176 VM_BUG_ON_PAGE(nr != 1 && shadow, page); 177 178 for (i = 0; i < nr; i++) { 179 struct radix_tree_node *node; 180 void **slot; 181 182 __radix_tree_lookup(&mapping->page_tree, page->index + i, 183 &node, &slot); 184 185 radix_tree_clear_tags(&mapping->page_tree, node, slot); 186 187 if (!node) { 188 VM_BUG_ON_PAGE(nr != 1, page); 189 /* 190 * We need a node to properly account shadow 191 * entries. Don't plant any without. XXX 192 */ 193 shadow = NULL; 194 } 195 196 radix_tree_replace_slot(slot, shadow); 197 198 if (!node) 199 break; 200 201 workingset_node_pages_dec(node); 202 if (shadow) 203 workingset_node_shadows_inc(node); 204 else 205 if (__radix_tree_delete_node(&mapping->page_tree, node)) 206 continue; 207 208 /* 209 * Track node that only contains shadow entries. DAX mappings 210 * contain no shadow entries and may contain other exceptional 211 * entries so skip those. 212 * 213 * Avoid acquiring the list_lru lock if already tracked. 214 * The list_empty() test is safe as node->private_list is 215 * protected by mapping->tree_lock. 216 */ 217 if (!dax_mapping(mapping) && !workingset_node_pages(node) && 218 list_empty(&node->private_list)) { 219 node->private_data = mapping; 220 list_lru_add(&workingset_shadow_nodes, 221 &node->private_list); 222 } 223 } 224 225 if (shadow) { 226 mapping->nrexceptional += nr; 227 /* 228 * Make sure the nrexceptional update is committed before 229 * the nrpages update so that final truncate racing 230 * with reclaim does not see both counters 0 at the 231 * same time and miss a shadow entry. 232 */ 233 smp_wmb(); 234 } 235 mapping->nrpages -= nr; 236 } 237 238 /* 239 * Delete a page from the page cache and free it. Caller has to make 240 * sure the page is locked and that nobody else uses it - or that usage 241 * is safe. The caller must hold the mapping's tree_lock. 
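 *
 * A minimal sketch of the expected calling pattern (it simply mirrors
 * delete_from_page_cache() below); the page is assumed to be locked and
 * still attached to @mapping:
 *
 *	spin_lock_irqsave(&mapping->tree_lock, flags);
 *	__delete_from_page_cache(page, NULL);
 *	spin_unlock_irqrestore(&mapping->tree_lock, flags);
 *	if (mapping->a_ops->freepage)
 *		mapping->a_ops->freepage(page);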
242 */ 243 void __delete_from_page_cache(struct page *page, void *shadow) 244 { 245 struct address_space *mapping = page->mapping; 246 int nr = hpage_nr_pages(page); 247 248 trace_mm_filemap_delete_from_page_cache(page); 249 /* 250 * if we're uptodate, flush out into the cleancache, otherwise 251 * invalidate any existing cleancache entries. We can't leave 252 * stale data around in the cleancache once our page is gone 253 */ 254 if (PageUptodate(page) && PageMappedToDisk(page)) 255 cleancache_put_page(page); 256 else 257 cleancache_invalidate_page(mapping, page); 258 259 VM_BUG_ON_PAGE(PageTail(page), page); 260 VM_BUG_ON_PAGE(page_mapped(page), page); 261 if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { 262 int mapcount; 263 264 pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", 265 current->comm, page_to_pfn(page)); 266 dump_page(page, "still mapped when deleted"); 267 dump_stack(); 268 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 269 270 mapcount = page_mapcount(page); 271 if (mapping_exiting(mapping) && 272 page_count(page) >= mapcount + 2) { 273 /* 274 * All vmas have already been torn down, so it's 275 * a good bet that actually the page is unmapped, 276 * and we'd prefer not to leak it: if we're wrong, 277 * some other bad page check should catch it later. 278 */ 279 page_mapcount_reset(page); 280 page_ref_sub(page, mapcount); 281 } 282 } 283 284 page_cache_tree_delete(mapping, page, shadow); 285 286 page->mapping = NULL; 287 /* Leave page->index set: truncation lookup relies upon it */ 288 289 /* hugetlb pages do not participate in page cache accounting. */ 290 if (!PageHuge(page)) 291 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); 292 if (PageSwapBacked(page)) { 293 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); 294 if (PageTransHuge(page)) 295 __dec_node_page_state(page, NR_SHMEM_THPS); 296 } else { 297 VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); 298 } 299 300 /* 301 * At this point page must be either written or cleaned by truncate. 302 * Dirty page here signals a bug and loss of unwritten data. 303 * 304 * This fixes dirty accounting after removing the page entirely but 305 * leaves PageDirty set: it has no effect for truncated page and 306 * anyway will be cleared before returning page into buddy allocator. 307 */ 308 if (WARN_ON_ONCE(PageDirty(page))) 309 account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); 310 } 311 312 /** 313 * delete_from_page_cache - delete page from page cache 314 * @page: the page which the kernel is trying to remove from page cache 315 * 316 * This must be called only on pages that have been verified to be in the page 317 * cache and locked. It will never put the page into the free list, the caller 318 * has a reference on the page. 
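 *
 * A hedged usage sketch (hypothetical caller; assumes the reference and the
 * page lock were obtained via find_lock_page()):
 *
 *	page = find_lock_page(mapping, index);
 *	if (page) {
 *		delete_from_page_cache(page);
 *		unlock_page(page);
 *		put_page(page);		(drops the find_lock_page() reference)
 *	}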
319 */ 320 void delete_from_page_cache(struct page *page) 321 { 322 struct address_space *mapping = page_mapping(page); 323 unsigned long flags; 324 void (*freepage)(struct page *); 325 326 BUG_ON(!PageLocked(page)); 327 328 freepage = mapping->a_ops->freepage; 329 330 spin_lock_irqsave(&mapping->tree_lock, flags); 331 __delete_from_page_cache(page, NULL); 332 spin_unlock_irqrestore(&mapping->tree_lock, flags); 333 334 if (freepage) 335 freepage(page); 336 337 if (PageTransHuge(page) && !PageHuge(page)) { 338 page_ref_sub(page, HPAGE_PMD_NR); 339 VM_BUG_ON_PAGE(page_count(page) <= 0, page); 340 } else { 341 put_page(page); 342 } 343 } 344 EXPORT_SYMBOL(delete_from_page_cache); 345 346 int filemap_check_errors(struct address_space *mapping) 347 { 348 int ret = 0; 349 /* Check for outstanding write errors */ 350 if (test_bit(AS_ENOSPC, &mapping->flags) && 351 test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 352 ret = -ENOSPC; 353 if (test_bit(AS_EIO, &mapping->flags) && 354 test_and_clear_bit(AS_EIO, &mapping->flags)) 355 ret = -EIO; 356 return ret; 357 } 358 EXPORT_SYMBOL(filemap_check_errors); 359 360 /** 361 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range 362 * @mapping: address space structure to write 363 * @start: offset in bytes where the range starts 364 * @end: offset in bytes where the range ends (inclusive) 365 * @sync_mode: enable synchronous operation 366 * 367 * Start writeback against all of a mapping's dirty pages that lie 368 * within the byte offsets <start, end> inclusive. 369 * 370 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 371 * opposed to a regular memory cleansing writeback. The difference between 372 * these two operations is that if a dirty page/buffer is encountered, it must 373 * be waited upon, and not just skipped over. 374 */ 375 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 376 loff_t end, int sync_mode) 377 { 378 int ret; 379 struct writeback_control wbc = { 380 .sync_mode = sync_mode, 381 .nr_to_write = LONG_MAX, 382 .range_start = start, 383 .range_end = end, 384 }; 385 386 if (!mapping_cap_writeback_dirty(mapping)) 387 return 0; 388 389 wbc_attach_fdatawrite_inode(&wbc, mapping->host); 390 ret = do_writepages(mapping, &wbc); 391 wbc_detach_inode(&wbc); 392 return ret; 393 } 394 395 static inline int __filemap_fdatawrite(struct address_space *mapping, 396 int sync_mode) 397 { 398 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); 399 } 400 401 int filemap_fdatawrite(struct address_space *mapping) 402 { 403 return __filemap_fdatawrite(mapping, WB_SYNC_ALL); 404 } 405 EXPORT_SYMBOL(filemap_fdatawrite); 406 407 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 408 loff_t end) 409 { 410 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 411 } 412 EXPORT_SYMBOL(filemap_fdatawrite_range); 413 414 /** 415 * filemap_flush - mostly a non-blocking flush 416 * @mapping: target address_space 417 * 418 * This is a mostly non-blocking flush. Not suitable for data-integrity 419 * purposes - I/O may not be started against all dirty pages. 
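 *
 * Hypothetical usage sketch: filemap_flush(inode->i_mapping) starts
 * best-effort background writeback, while the data-integrity variants,
 * e.g. filemap_write_and_wait(inode->i_mapping), must be used when the
 * caller has to know that every dirty page was actually written back.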
420 */ 421 int filemap_flush(struct address_space *mapping) 422 { 423 return __filemap_fdatawrite(mapping, WB_SYNC_NONE); 424 } 425 EXPORT_SYMBOL(filemap_flush); 426 427 static int __filemap_fdatawait_range(struct address_space *mapping, 428 loff_t start_byte, loff_t end_byte) 429 { 430 pgoff_t index = start_byte >> PAGE_SHIFT; 431 pgoff_t end = end_byte >> PAGE_SHIFT; 432 struct pagevec pvec; 433 int nr_pages; 434 int ret = 0; 435 436 if (end_byte < start_byte) 437 goto out; 438 439 pagevec_init(&pvec, 0); 440 while ((index <= end) && 441 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 442 PAGECACHE_TAG_WRITEBACK, 443 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { 444 unsigned i; 445 446 for (i = 0; i < nr_pages; i++) { 447 struct page *page = pvec.pages[i]; 448 449 /* until radix tree lookup accepts end_index */ 450 if (page->index > end) 451 continue; 452 453 wait_on_page_writeback(page); 454 if (TestClearPageError(page)) 455 ret = -EIO; 456 } 457 pagevec_release(&pvec); 458 cond_resched(); 459 } 460 out: 461 return ret; 462 } 463 464 /** 465 * filemap_fdatawait_range - wait for writeback to complete 466 * @mapping: address space structure to wait for 467 * @start_byte: offset in bytes where the range starts 468 * @end_byte: offset in bytes where the range ends (inclusive) 469 * 470 * Walk the list of under-writeback pages of the given address space 471 * in the given range and wait for all of them. Check error status of 472 * the address space and return it. 473 * 474 * Since the error status of the address space is cleared by this function, 475 * callers are responsible for checking the return value and handling and/or 476 * reporting the error. 477 */ 478 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, 479 loff_t end_byte) 480 { 481 int ret, ret2; 482 483 ret = __filemap_fdatawait_range(mapping, start_byte, end_byte); 484 ret2 = filemap_check_errors(mapping); 485 if (!ret) 486 ret = ret2; 487 488 return ret; 489 } 490 EXPORT_SYMBOL(filemap_fdatawait_range); 491 492 /** 493 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors 494 * @mapping: address space structure to wait for 495 * 496 * Walk the list of under-writeback pages of the given address space 497 * and wait for all of them. Unlike filemap_fdatawait(), this function 498 * does not clear error status of the address space. 499 * 500 * Use this function if callers don't handle errors themselves. Expected 501 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), 502 * fsfreeze(8) 503 */ 504 void filemap_fdatawait_keep_errors(struct address_space *mapping) 505 { 506 loff_t i_size = i_size_read(mapping->host); 507 508 if (i_size == 0) 509 return; 510 511 __filemap_fdatawait_range(mapping, 0, i_size - 1); 512 } 513 514 /** 515 * filemap_fdatawait - wait for all under-writeback pages to complete 516 * @mapping: address space structure to wait for 517 * 518 * Walk the list of under-writeback pages of the given address space 519 * and wait for all of them. Check error status of the address space 520 * and return it. 521 * 522 * Since the error status of the address space is cleared by this function, 523 * callers are responsible for checking the return value and handling and/or 524 * reporting the error. 
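 *
 * A minimal sketch of the expected write-then-wait error handling (this is
 * essentially what filemap_write_and_wait() below does):
 *
 *	err = filemap_fdatawrite(mapping);
 *	if (err != -EIO) {
 *		int err2 = filemap_fdatawait(mapping);
 *		if (!err)
 *			err = err2;
 *	}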
525 */ 526 int filemap_fdatawait(struct address_space *mapping) 527 { 528 loff_t i_size = i_size_read(mapping->host); 529 530 if (i_size == 0) 531 return 0; 532 533 return filemap_fdatawait_range(mapping, 0, i_size - 1); 534 } 535 EXPORT_SYMBOL(filemap_fdatawait); 536 537 int filemap_write_and_wait(struct address_space *mapping) 538 { 539 int err = 0; 540 541 if ((!dax_mapping(mapping) && mapping->nrpages) || 542 (dax_mapping(mapping) && mapping->nrexceptional)) { 543 err = filemap_fdatawrite(mapping); 544 /* 545 * Even if the above returned error, the pages may be 546 * written partially (e.g. -ENOSPC), so we wait for it. 547 * But the -EIO is special case, it may indicate the worst 548 * thing (e.g. bug) happened, so we avoid waiting for it. 549 */ 550 if (err != -EIO) { 551 int err2 = filemap_fdatawait(mapping); 552 if (!err) 553 err = err2; 554 } 555 } else { 556 err = filemap_check_errors(mapping); 557 } 558 return err; 559 } 560 EXPORT_SYMBOL(filemap_write_and_wait); 561 562 /** 563 * filemap_write_and_wait_range - write out & wait on a file range 564 * @mapping: the address_space for the pages 565 * @lstart: offset in bytes where the range starts 566 * @lend: offset in bytes where the range ends (inclusive) 567 * 568 * Write out and wait upon file offsets lstart->lend, inclusive. 569 * 570 * Note that `lend' is inclusive (describes the last byte to be written) so 571 * that this function can be used to write to the very end-of-file (end = -1). 572 */ 573 int filemap_write_and_wait_range(struct address_space *mapping, 574 loff_t lstart, loff_t lend) 575 { 576 int err = 0; 577 578 if ((!dax_mapping(mapping) && mapping->nrpages) || 579 (dax_mapping(mapping) && mapping->nrexceptional)) { 580 err = __filemap_fdatawrite_range(mapping, lstart, lend, 581 WB_SYNC_ALL); 582 /* See comment of filemap_write_and_wait() */ 583 if (err != -EIO) { 584 int err2 = filemap_fdatawait_range(mapping, 585 lstart, lend); 586 if (!err) 587 err = err2; 588 } 589 } else { 590 err = filemap_check_errors(mapping); 591 } 592 return err; 593 } 594 EXPORT_SYMBOL(filemap_write_and_wait_range); 595 596 /** 597 * replace_page_cache_page - replace a pagecache page with a new one 598 * @old: page to be replaced 599 * @new: page to replace with 600 * @gfp_mask: allocation mode 601 * 602 * This function replaces a page in the pagecache with a new one. On 603 * success it acquires the pagecache reference for the new page and 604 * drops it for the old page. Both the old and new pages must be 605 * locked. This function does not add the new page to the LRU, the 606 * caller must do that. 607 * 608 * The remove + add is atomic. The only way this function can fail is 609 * memory allocation failure. 610 */ 611 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 612 { 613 int error; 614 615 VM_BUG_ON_PAGE(!PageLocked(old), old); 616 VM_BUG_ON_PAGE(!PageLocked(new), new); 617 VM_BUG_ON_PAGE(new->mapping, new); 618 619 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 620 if (!error) { 621 struct address_space *mapping = old->mapping; 622 void (*freepage)(struct page *); 623 unsigned long flags; 624 625 pgoff_t offset = old->index; 626 freepage = mapping->a_ops->freepage; 627 628 get_page(new); 629 new->mapping = mapping; 630 new->index = offset; 631 632 spin_lock_irqsave(&mapping->tree_lock, flags); 633 __delete_from_page_cache(old, NULL); 634 error = page_cache_tree_insert(mapping, new, NULL); 635 BUG_ON(error); 636 637 /* 638 * hugetlb pages do not participate in page cache accounting. 
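 * The NR_FILE_PAGES/NR_SHMEM increments below mirror the per-node
 * decrements applied to the old page in __delete_from_page_cache().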
639 */ 640 if (!PageHuge(new)) 641 __inc_node_page_state(new, NR_FILE_PAGES); 642 if (PageSwapBacked(new)) 643 __inc_node_page_state(new, NR_SHMEM); 644 spin_unlock_irqrestore(&mapping->tree_lock, flags); 645 mem_cgroup_migrate(old, new); 646 radix_tree_preload_end(); 647 if (freepage) 648 freepage(old); 649 put_page(old); 650 } 651 652 return error; 653 } 654 EXPORT_SYMBOL_GPL(replace_page_cache_page); 655 656 static int __add_to_page_cache_locked(struct page *page, 657 struct address_space *mapping, 658 pgoff_t offset, gfp_t gfp_mask, 659 void **shadowp) 660 { 661 int huge = PageHuge(page); 662 struct mem_cgroup *memcg; 663 int error; 664 665 VM_BUG_ON_PAGE(!PageLocked(page), page); 666 VM_BUG_ON_PAGE(PageSwapBacked(page), page); 667 668 if (!huge) { 669 error = mem_cgroup_try_charge(page, current->mm, 670 gfp_mask, &memcg, false); 671 if (error) 672 return error; 673 } 674 675 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); 676 if (error) { 677 if (!huge) 678 mem_cgroup_cancel_charge(page, memcg, false); 679 return error; 680 } 681 682 get_page(page); 683 page->mapping = mapping; 684 page->index = offset; 685 686 spin_lock_irq(&mapping->tree_lock); 687 error = page_cache_tree_insert(mapping, page, shadowp); 688 radix_tree_preload_end(); 689 if (unlikely(error)) 690 goto err_insert; 691 692 /* hugetlb pages do not participate in page cache accounting. */ 693 if (!huge) 694 __inc_node_page_state(page, NR_FILE_PAGES); 695 spin_unlock_irq(&mapping->tree_lock); 696 if (!huge) 697 mem_cgroup_commit_charge(page, memcg, false, false); 698 trace_mm_filemap_add_to_page_cache(page); 699 return 0; 700 err_insert: 701 page->mapping = NULL; 702 /* Leave page->index set: truncation relies upon it */ 703 spin_unlock_irq(&mapping->tree_lock); 704 if (!huge) 705 mem_cgroup_cancel_charge(page, memcg, false); 706 put_page(page); 707 return error; 708 } 709 710 /** 711 * add_to_page_cache_locked - add a locked page to the pagecache 712 * @page: page to add 713 * @mapping: the page's address_space 714 * @offset: page index 715 * @gfp_mask: page allocation mode 716 * 717 * This function is used to add a page to the pagecache. It must be locked. 718 * This function does not add the page to the LRU. The caller must do that. 719 */ 720 int add_to_page_cache_locked(struct page *page, struct address_space *mapping, 721 pgoff_t offset, gfp_t gfp_mask) 722 { 723 return __add_to_page_cache_locked(page, mapping, offset, 724 gfp_mask, NULL); 725 } 726 EXPORT_SYMBOL(add_to_page_cache_locked); 727 728 int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 729 pgoff_t offset, gfp_t gfp_mask) 730 { 731 void *shadow = NULL; 732 int ret; 733 734 __SetPageLocked(page); 735 ret = __add_to_page_cache_locked(page, mapping, offset, 736 gfp_mask, &shadow); 737 if (unlikely(ret)) 738 __ClearPageLocked(page); 739 else { 740 /* 741 * The page might have been evicted from cache only 742 * recently, in which case it should be activated like 743 * any other repeatedly accessed page. 744 * The exception is pages getting rewritten; evicting other 745 * data from the working set, only to cache data that will 746 * get overwritten with something else, is a waste of memory. 
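 * That is what the __GFP_WRITE check below implements: refaulting pages
 * allocated for an impending write are left on the inactive list instead
 * of being activated.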
747 */ 748 if (!(gfp_mask & __GFP_WRITE) && 749 shadow && workingset_refault(shadow)) { 750 SetPageActive(page); 751 workingset_activation(page); 752 } else 753 ClearPageActive(page); 754 lru_cache_add(page); 755 } 756 return ret; 757 } 758 EXPORT_SYMBOL_GPL(add_to_page_cache_lru); 759 760 #ifdef CONFIG_NUMA 761 struct page *__page_cache_alloc(gfp_t gfp) 762 { 763 int n; 764 struct page *page; 765 766 if (cpuset_do_page_mem_spread()) { 767 unsigned int cpuset_mems_cookie; 768 do { 769 cpuset_mems_cookie = read_mems_allowed_begin(); 770 n = cpuset_mem_spread_node(); 771 page = __alloc_pages_node(n, gfp, 0); 772 } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); 773 774 return page; 775 } 776 return alloc_pages(gfp, 0); 777 } 778 EXPORT_SYMBOL(__page_cache_alloc); 779 #endif 780 781 /* 782 * In order to wait for pages to become available there must be 783 * waitqueues associated with pages. By using a hash table of 784 * waitqueues where the bucket discipline is to maintain all 785 * waiters on the same queue and wake all when any of the pages 786 * become available, and for the woken contexts to check to be 787 * sure the appropriate page became available, this saves space 788 * at a cost of "thundering herd" phenomena during rare hash 789 * collisions. 790 */ 791 wait_queue_head_t *page_waitqueue(struct page *page) 792 { 793 const struct zone *zone = page_zone(page); 794 795 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; 796 } 797 EXPORT_SYMBOL(page_waitqueue); 798 799 void wait_on_page_bit(struct page *page, int bit_nr) 800 { 801 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 802 803 if (test_bit(bit_nr, &page->flags)) 804 __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io, 805 TASK_UNINTERRUPTIBLE); 806 } 807 EXPORT_SYMBOL(wait_on_page_bit); 808 809 int wait_on_page_bit_killable(struct page *page, int bit_nr) 810 { 811 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 812 813 if (!test_bit(bit_nr, &page->flags)) 814 return 0; 815 816 return __wait_on_bit(page_waitqueue(page), &wait, 817 bit_wait_io, TASK_KILLABLE); 818 } 819 820 int wait_on_page_bit_killable_timeout(struct page *page, 821 int bit_nr, unsigned long timeout) 822 { 823 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 824 825 wait.key.timeout = jiffies + timeout; 826 if (!test_bit(bit_nr, &page->flags)) 827 return 0; 828 return __wait_on_bit(page_waitqueue(page), &wait, 829 bit_wait_io_timeout, TASK_KILLABLE); 830 } 831 EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout); 832 833 /** 834 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 835 * @page: Page defining the wait queue of interest 836 * @waiter: Waiter to add to the queue 837 * 838 * Add an arbitrary @waiter to the wait queue for the nominated @page. 839 */ 840 void add_page_wait_queue(struct page *page, wait_queue_t *waiter) 841 { 842 wait_queue_head_t *q = page_waitqueue(page); 843 unsigned long flags; 844 845 spin_lock_irqsave(&q->lock, flags); 846 __add_wait_queue(q, waiter); 847 spin_unlock_irqrestore(&q->lock, flags); 848 } 849 EXPORT_SYMBOL_GPL(add_page_wait_queue); 850 851 /** 852 * unlock_page - unlock a locked page 853 * @page: the page 854 * 855 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). 856 * Also wakes sleepers in wait_on_page_writeback() because the wakeup 857 * mechanism between PageLocked pages and PageWriteback pages is shared. 858 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 
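 *
 * A hedged usage sketch (the page is assumed to have been locked by this
 * context via lock_page() or trylock_page()):
 *
 *	lock_page(page);
 *	... operate on the page under PG_locked ...
 *	unlock_page(page);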
859 * 860 * The mb is necessary to enforce ordering between the clear_bit and the read 861 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). 862 */ 863 void unlock_page(struct page *page) 864 { 865 page = compound_head(page); 866 VM_BUG_ON_PAGE(!PageLocked(page), page); 867 clear_bit_unlock(PG_locked, &page->flags); 868 smp_mb__after_atomic(); 869 wake_up_page(page, PG_locked); 870 } 871 EXPORT_SYMBOL(unlock_page); 872 873 /** 874 * end_page_writeback - end writeback against a page 875 * @page: the page 876 */ 877 void end_page_writeback(struct page *page) 878 { 879 /* 880 * TestClearPageReclaim could be used here but it is an atomic 881 * operation and overkill in this particular case. Failing to 882 * shuffle a page marked for immediate reclaim is too mild to 883 * justify taking an atomic operation penalty at the end of 884 * ever page writeback. 885 */ 886 if (PageReclaim(page)) { 887 ClearPageReclaim(page); 888 rotate_reclaimable_page(page); 889 } 890 891 if (!test_clear_page_writeback(page)) 892 BUG(); 893 894 smp_mb__after_atomic(); 895 wake_up_page(page, PG_writeback); 896 } 897 EXPORT_SYMBOL(end_page_writeback); 898 899 /* 900 * After completing I/O on a page, call this routine to update the page 901 * flags appropriately 902 */ 903 void page_endio(struct page *page, bool is_write, int err) 904 { 905 if (!is_write) { 906 if (!err) { 907 SetPageUptodate(page); 908 } else { 909 ClearPageUptodate(page); 910 SetPageError(page); 911 } 912 unlock_page(page); 913 } else { 914 if (err) { 915 SetPageError(page); 916 if (page->mapping) 917 mapping_set_error(page->mapping, err); 918 } 919 end_page_writeback(page); 920 } 921 } 922 EXPORT_SYMBOL_GPL(page_endio); 923 924 /** 925 * __lock_page - get a lock on the page, assuming we need to sleep to get it 926 * @page: the page to lock 927 */ 928 void __lock_page(struct page *page) 929 { 930 struct page *page_head = compound_head(page); 931 DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); 932 933 __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, 934 TASK_UNINTERRUPTIBLE); 935 } 936 EXPORT_SYMBOL(__lock_page); 937 938 int __lock_page_killable(struct page *page) 939 { 940 struct page *page_head = compound_head(page); 941 DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); 942 943 return __wait_on_bit_lock(page_waitqueue(page_head), &wait, 944 bit_wait_io, TASK_KILLABLE); 945 } 946 EXPORT_SYMBOL_GPL(__lock_page_killable); 947 948 /* 949 * Return values: 950 * 1 - page is locked; mmap_sem is still held. 951 * 0 - page is not locked. 952 * mmap_sem has been released (up_read()), unless flags had both 953 * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in 954 * which case mmap_sem is still held. 955 * 956 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 957 * with the page locked and the mmap_sem unperturbed. 958 */ 959 int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 960 unsigned int flags) 961 { 962 if (flags & FAULT_FLAG_ALLOW_RETRY) { 963 /* 964 * CAUTION! In this case, mmap_sem is not released 965 * even though return 0. 
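 * That is, with both FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT
 * set, the caller keeps mmap_sem and must treat the 0 return as "retry
 * without sleeping here".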
966 */ 967 if (flags & FAULT_FLAG_RETRY_NOWAIT) 968 return 0; 969 970 up_read(&mm->mmap_sem); 971 if (flags & FAULT_FLAG_KILLABLE) 972 wait_on_page_locked_killable(page); 973 else 974 wait_on_page_locked(page); 975 return 0; 976 } else { 977 if (flags & FAULT_FLAG_KILLABLE) { 978 int ret; 979 980 ret = __lock_page_killable(page); 981 if (ret) { 982 up_read(&mm->mmap_sem); 983 return 0; 984 } 985 } else 986 __lock_page(page); 987 return 1; 988 } 989 } 990 991 /** 992 * page_cache_next_hole - find the next hole (not-present entry) 993 * @mapping: mapping 994 * @index: index 995 * @max_scan: maximum range to search 996 * 997 * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the 998 * lowest indexed hole. 999 * 1000 * Returns: the index of the hole if found, otherwise returns an index 1001 * outside of the set specified (in which case 'return - index >= 1002 * max_scan' will be true). In rare cases of index wrap-around, 0 will 1003 * be returned. 1004 * 1005 * page_cache_next_hole may be called under rcu_read_lock. However, 1006 * like radix_tree_gang_lookup, this will not atomically search a 1007 * snapshot of the tree at a single point in time. For example, if a 1008 * hole is created at index 5, then subsequently a hole is created at 1009 * index 10, page_cache_next_hole covering both indexes may return 10 1010 * if called under rcu_read_lock. 1011 */ 1012 pgoff_t page_cache_next_hole(struct address_space *mapping, 1013 pgoff_t index, unsigned long max_scan) 1014 { 1015 unsigned long i; 1016 1017 for (i = 0; i < max_scan; i++) { 1018 struct page *page; 1019 1020 page = radix_tree_lookup(&mapping->page_tree, index); 1021 if (!page || radix_tree_exceptional_entry(page)) 1022 break; 1023 index++; 1024 if (index == 0) 1025 break; 1026 } 1027 1028 return index; 1029 } 1030 EXPORT_SYMBOL(page_cache_next_hole); 1031 1032 /** 1033 * page_cache_prev_hole - find the prev hole (not-present entry) 1034 * @mapping: mapping 1035 * @index: index 1036 * @max_scan: maximum range to search 1037 * 1038 * Search backwards in the range [max(index-max_scan+1, 0), index] for 1039 * the first hole. 1040 * 1041 * Returns: the index of the hole if found, otherwise returns an index 1042 * outside of the set specified (in which case 'index - return >= 1043 * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX 1044 * will be returned. 1045 * 1046 * page_cache_prev_hole may be called under rcu_read_lock. However, 1047 * like radix_tree_gang_lookup, this will not atomically search a 1048 * snapshot of the tree at a single point in time. For example, if a 1049 * hole is created at index 10, then subsequently a hole is created at 1050 * index 5, page_cache_prev_hole covering both indexes may return 5 if 1051 * called under rcu_read_lock. 1052 */ 1053 pgoff_t page_cache_prev_hole(struct address_space *mapping, 1054 pgoff_t index, unsigned long max_scan) 1055 { 1056 unsigned long i; 1057 1058 for (i = 0; i < max_scan; i++) { 1059 struct page *page; 1060 1061 page = radix_tree_lookup(&mapping->page_tree, index); 1062 if (!page || radix_tree_exceptional_entry(page)) 1063 break; 1064 index--; 1065 if (index == ULONG_MAX) 1066 break; 1067 } 1068 1069 return index; 1070 } 1071 EXPORT_SYMBOL(page_cache_prev_hole); 1072 1073 /** 1074 * find_get_entry - find and get a page cache entry 1075 * @mapping: the address_space to search 1076 * @offset: the page cache index 1077 * 1078 * Looks up the page cache slot at @mapping & @offset. 
If there is a 1079 * page cache page, it is returned with an increased refcount. 1080 * 1081 * If the slot holds a shadow entry of a previously evicted page, or a 1082 * swap entry from shmem/tmpfs, it is returned. 1083 * 1084 * Otherwise, %NULL is returned. 1085 */ 1086 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) 1087 { 1088 void **pagep; 1089 struct page *head, *page; 1090 1091 rcu_read_lock(); 1092 repeat: 1093 page = NULL; 1094 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); 1095 if (pagep) { 1096 page = radix_tree_deref_slot(pagep); 1097 if (unlikely(!page)) 1098 goto out; 1099 if (radix_tree_exception(page)) { 1100 if (radix_tree_deref_retry(page)) 1101 goto repeat; 1102 /* 1103 * A shadow entry of a recently evicted page, 1104 * or a swap entry from shmem/tmpfs. Return 1105 * it without attempting to raise page count. 1106 */ 1107 goto out; 1108 } 1109 1110 head = compound_head(page); 1111 if (!page_cache_get_speculative(head)) 1112 goto repeat; 1113 1114 /* The page was split under us? */ 1115 if (compound_head(page) != head) { 1116 put_page(head); 1117 goto repeat; 1118 } 1119 1120 /* 1121 * Has the page moved? 1122 * This is part of the lockless pagecache protocol. See 1123 * include/linux/pagemap.h for details. 1124 */ 1125 if (unlikely(page != *pagep)) { 1126 put_page(head); 1127 goto repeat; 1128 } 1129 } 1130 out: 1131 rcu_read_unlock(); 1132 1133 return page; 1134 } 1135 EXPORT_SYMBOL(find_get_entry); 1136 1137 /** 1138 * find_lock_entry - locate, pin and lock a page cache entry 1139 * @mapping: the address_space to search 1140 * @offset: the page cache index 1141 * 1142 * Looks up the page cache slot at @mapping & @offset. If there is a 1143 * page cache page, it is returned locked and with an increased 1144 * refcount. 1145 * 1146 * If the slot holds a shadow entry of a previously evicted page, or a 1147 * swap entry from shmem/tmpfs, it is returned. 1148 * 1149 * Otherwise, %NULL is returned. 1150 * 1151 * find_lock_entry() may sleep. 1152 */ 1153 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) 1154 { 1155 struct page *page; 1156 1157 repeat: 1158 page = find_get_entry(mapping, offset); 1159 if (page && !radix_tree_exception(page)) { 1160 lock_page(page); 1161 /* Has the page been truncated? */ 1162 if (unlikely(page_mapping(page) != mapping)) { 1163 unlock_page(page); 1164 put_page(page); 1165 goto repeat; 1166 } 1167 VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); 1168 } 1169 return page; 1170 } 1171 EXPORT_SYMBOL(find_lock_entry); 1172 1173 /** 1174 * pagecache_get_page - find and get a page reference 1175 * @mapping: the address_space to search 1176 * @offset: the page index 1177 * @fgp_flags: PCG flags 1178 * @gfp_mask: gfp mask to use for the page cache data page allocation 1179 * 1180 * Looks up the page cache slot at @mapping & @offset. 1181 * 1182 * PCG flags modify how the page is returned. 1183 * 1184 * FGP_ACCESSED: the page will be marked accessed 1185 * FGP_LOCK: Page is return locked 1186 * FGP_CREAT: If page is not present then a new page is allocated using 1187 * @gfp_mask and added to the page cache and the VM's LRU 1188 * list. The page is returned locked and with an increased 1189 * refcount. Otherwise, %NULL is returned. 1190 * 1191 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even 1192 * if the GFP flags specified for FGP_CREAT are atomic. 1193 * 1194 * If there is a page cache page, it is returned with an increased refcount. 
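 *
 * A minimal usage sketch (hypothetical caller, not a specific filesystem):
 *
 *	page = pagecache_get_page(mapping, index,
 *				  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
 *				  mapping_gfp_mask(mapping));
 *	if (!page)
 *		return -ENOMEM;
 *	... page is locked, referenced and marked accessed here ...
 *	unlock_page(page);
 *	put_page(page);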
1195 */ 1196 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, 1197 int fgp_flags, gfp_t gfp_mask) 1198 { 1199 struct page *page; 1200 1201 repeat: 1202 page = find_get_entry(mapping, offset); 1203 if (radix_tree_exceptional_entry(page)) 1204 page = NULL; 1205 if (!page) 1206 goto no_page; 1207 1208 if (fgp_flags & FGP_LOCK) { 1209 if (fgp_flags & FGP_NOWAIT) { 1210 if (!trylock_page(page)) { 1211 put_page(page); 1212 return NULL; 1213 } 1214 } else { 1215 lock_page(page); 1216 } 1217 1218 /* Has the page been truncated? */ 1219 if (unlikely(page->mapping != mapping)) { 1220 unlock_page(page); 1221 put_page(page); 1222 goto repeat; 1223 } 1224 VM_BUG_ON_PAGE(page->index != offset, page); 1225 } 1226 1227 if (page && (fgp_flags & FGP_ACCESSED)) 1228 mark_page_accessed(page); 1229 1230 no_page: 1231 if (!page && (fgp_flags & FGP_CREAT)) { 1232 int err; 1233 if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) 1234 gfp_mask |= __GFP_WRITE; 1235 if (fgp_flags & FGP_NOFS) 1236 gfp_mask &= ~__GFP_FS; 1237 1238 page = __page_cache_alloc(gfp_mask); 1239 if (!page) 1240 return NULL; 1241 1242 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) 1243 fgp_flags |= FGP_LOCK; 1244 1245 /* Init accessed so avoid atomic mark_page_accessed later */ 1246 if (fgp_flags & FGP_ACCESSED) 1247 __SetPageReferenced(page); 1248 1249 err = add_to_page_cache_lru(page, mapping, offset, 1250 gfp_mask & GFP_RECLAIM_MASK); 1251 if (unlikely(err)) { 1252 put_page(page); 1253 page = NULL; 1254 if (err == -EEXIST) 1255 goto repeat; 1256 } 1257 } 1258 1259 return page; 1260 } 1261 EXPORT_SYMBOL(pagecache_get_page); 1262 1263 /** 1264 * find_get_entries - gang pagecache lookup 1265 * @mapping: The address_space to search 1266 * @start: The starting page cache index 1267 * @nr_entries: The maximum number of entries 1268 * @entries: Where the resulting entries are placed 1269 * @indices: The cache indices corresponding to the entries in @entries 1270 * 1271 * find_get_entries() will search for and return a group of up to 1272 * @nr_entries entries in the mapping. The entries are placed at 1273 * @entries. find_get_entries() takes a reference against any actual 1274 * pages it returns. 1275 * 1276 * The search returns a group of mapping-contiguous page cache entries 1277 * with ascending indexes. There may be holes in the indices due to 1278 * not-present pages. 1279 * 1280 * Any shadow entries of evicted pages, or swap entries from 1281 * shmem/tmpfs, are included in the returned array. 1282 * 1283 * find_get_entries() returns the number of pages and shadow entries 1284 * which were found. 1285 */ 1286 unsigned find_get_entries(struct address_space *mapping, 1287 pgoff_t start, unsigned int nr_entries, 1288 struct page **entries, pgoff_t *indices) 1289 { 1290 void **slot; 1291 unsigned int ret = 0; 1292 struct radix_tree_iter iter; 1293 1294 if (!nr_entries) 1295 return 0; 1296 1297 rcu_read_lock(); 1298 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 1299 struct page *head, *page; 1300 repeat: 1301 page = radix_tree_deref_slot(slot); 1302 if (unlikely(!page)) 1303 continue; 1304 if (radix_tree_exception(page)) { 1305 if (radix_tree_deref_retry(page)) { 1306 slot = radix_tree_iter_retry(&iter); 1307 continue; 1308 } 1309 /* 1310 * A shadow entry of a recently evicted page, a swap 1311 * entry from shmem/tmpfs or a DAX entry. Return it 1312 * without attempting to raise page count. 
1313 */ 1314 goto export; 1315 } 1316 1317 head = compound_head(page); 1318 if (!page_cache_get_speculative(head)) 1319 goto repeat; 1320 1321 /* The page was split under us? */ 1322 if (compound_head(page) != head) { 1323 put_page(head); 1324 goto repeat; 1325 } 1326 1327 /* Has the page moved? */ 1328 if (unlikely(page != *slot)) { 1329 put_page(head); 1330 goto repeat; 1331 } 1332 export: 1333 indices[ret] = iter.index; 1334 entries[ret] = page; 1335 if (++ret == nr_entries) 1336 break; 1337 } 1338 rcu_read_unlock(); 1339 return ret; 1340 } 1341 1342 /** 1343 * find_get_pages - gang pagecache lookup 1344 * @mapping: The address_space to search 1345 * @start: The starting page index 1346 * @nr_pages: The maximum number of pages 1347 * @pages: Where the resulting pages are placed 1348 * 1349 * find_get_pages() will search for and return a group of up to 1350 * @nr_pages pages in the mapping. The pages are placed at @pages. 1351 * find_get_pages() takes a reference against the returned pages. 1352 * 1353 * The search returns a group of mapping-contiguous pages with ascending 1354 * indexes. There may be holes in the indices due to not-present pages. 1355 * 1356 * find_get_pages() returns the number of pages which were found. 1357 */ 1358 unsigned find_get_pages(struct address_space *mapping, pgoff_t start, 1359 unsigned int nr_pages, struct page **pages) 1360 { 1361 struct radix_tree_iter iter; 1362 void **slot; 1363 unsigned ret = 0; 1364 1365 if (unlikely(!nr_pages)) 1366 return 0; 1367 1368 rcu_read_lock(); 1369 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 1370 struct page *head, *page; 1371 repeat: 1372 page = radix_tree_deref_slot(slot); 1373 if (unlikely(!page)) 1374 continue; 1375 1376 if (radix_tree_exception(page)) { 1377 if (radix_tree_deref_retry(page)) { 1378 slot = radix_tree_iter_retry(&iter); 1379 continue; 1380 } 1381 /* 1382 * A shadow entry of a recently evicted page, 1383 * or a swap entry from shmem/tmpfs. Skip 1384 * over it. 1385 */ 1386 continue; 1387 } 1388 1389 head = compound_head(page); 1390 if (!page_cache_get_speculative(head)) 1391 goto repeat; 1392 1393 /* The page was split under us? */ 1394 if (compound_head(page) != head) { 1395 put_page(head); 1396 goto repeat; 1397 } 1398 1399 /* Has the page moved? */ 1400 if (unlikely(page != *slot)) { 1401 put_page(head); 1402 goto repeat; 1403 } 1404 1405 pages[ret] = page; 1406 if (++ret == nr_pages) 1407 break; 1408 } 1409 1410 rcu_read_unlock(); 1411 return ret; 1412 } 1413 1414 /** 1415 * find_get_pages_contig - gang contiguous pagecache lookup 1416 * @mapping: The address_space to search 1417 * @index: The starting page index 1418 * @nr_pages: The maximum number of pages 1419 * @pages: Where the resulting pages are placed 1420 * 1421 * find_get_pages_contig() works exactly like find_get_pages(), except 1422 * that the returned number of pages are guaranteed to be contiguous. 1423 * 1424 * find_get_pages_contig() returns the number of pages which were found. 
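 *
 * A hedged usage sketch (hypothetical caller collecting up to 16
 * index-contiguous pages starting at @index):
 *
 *	struct page *pages[16];
 *	unsigned nr = find_get_pages_contig(mapping, index, 16, pages);
 *
 *	... use pages[0] .. pages[nr - 1] ...
 *	while (nr--)
 *		put_page(pages[nr]);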
1425 */ 1426 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, 1427 unsigned int nr_pages, struct page **pages) 1428 { 1429 struct radix_tree_iter iter; 1430 void **slot; 1431 unsigned int ret = 0; 1432 1433 if (unlikely(!nr_pages)) 1434 return 0; 1435 1436 rcu_read_lock(); 1437 radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { 1438 struct page *head, *page; 1439 repeat: 1440 page = radix_tree_deref_slot(slot); 1441 /* The hole, there no reason to continue */ 1442 if (unlikely(!page)) 1443 break; 1444 1445 if (radix_tree_exception(page)) { 1446 if (radix_tree_deref_retry(page)) { 1447 slot = radix_tree_iter_retry(&iter); 1448 continue; 1449 } 1450 /* 1451 * A shadow entry of a recently evicted page, 1452 * or a swap entry from shmem/tmpfs. Stop 1453 * looking for contiguous pages. 1454 */ 1455 break; 1456 } 1457 1458 head = compound_head(page); 1459 if (!page_cache_get_speculative(head)) 1460 goto repeat; 1461 1462 /* The page was split under us? */ 1463 if (compound_head(page) != head) { 1464 put_page(head); 1465 goto repeat; 1466 } 1467 1468 /* Has the page moved? */ 1469 if (unlikely(page != *slot)) { 1470 put_page(head); 1471 goto repeat; 1472 } 1473 1474 /* 1475 * must check mapping and index after taking the ref. 1476 * otherwise we can get both false positives and false 1477 * negatives, which is just confusing to the caller. 1478 */ 1479 if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { 1480 put_page(page); 1481 break; 1482 } 1483 1484 pages[ret] = page; 1485 if (++ret == nr_pages) 1486 break; 1487 } 1488 rcu_read_unlock(); 1489 return ret; 1490 } 1491 EXPORT_SYMBOL(find_get_pages_contig); 1492 1493 /** 1494 * find_get_pages_tag - find and return pages that match @tag 1495 * @mapping: the address_space to search 1496 * @index: the starting page index 1497 * @tag: the tag index 1498 * @nr_pages: the maximum number of pages 1499 * @pages: where the resulting pages are placed 1500 * 1501 * Like find_get_pages, except we only return pages which are tagged with 1502 * @tag. We update @index to index the next page for the traversal. 1503 */ 1504 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 1505 int tag, unsigned int nr_pages, struct page **pages) 1506 { 1507 struct radix_tree_iter iter; 1508 void **slot; 1509 unsigned ret = 0; 1510 1511 if (unlikely(!nr_pages)) 1512 return 0; 1513 1514 rcu_read_lock(); 1515 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1516 &iter, *index, tag) { 1517 struct page *head, *page; 1518 repeat: 1519 page = radix_tree_deref_slot(slot); 1520 if (unlikely(!page)) 1521 continue; 1522 1523 if (radix_tree_exception(page)) { 1524 if (radix_tree_deref_retry(page)) { 1525 slot = radix_tree_iter_retry(&iter); 1526 continue; 1527 } 1528 /* 1529 * A shadow entry of a recently evicted page. 1530 * 1531 * Those entries should never be tagged, but 1532 * this tree walk is lockless and the tags are 1533 * looked up in bulk, one radix tree node at a 1534 * time, so there is a sizable window for page 1535 * reclaim to evict a page we saw tagged. 1536 * 1537 * Skip over it. 1538 */ 1539 continue; 1540 } 1541 1542 head = compound_head(page); 1543 if (!page_cache_get_speculative(head)) 1544 goto repeat; 1545 1546 /* The page was split under us? */ 1547 if (compound_head(page) != head) { 1548 put_page(head); 1549 goto repeat; 1550 } 1551 1552 /* Has the page moved? 
*/ 1553 if (unlikely(page != *slot)) { 1554 put_page(head); 1555 goto repeat; 1556 } 1557 1558 pages[ret] = page; 1559 if (++ret == nr_pages) 1560 break; 1561 } 1562 1563 rcu_read_unlock(); 1564 1565 if (ret) 1566 *index = pages[ret - 1]->index + 1; 1567 1568 return ret; 1569 } 1570 EXPORT_SYMBOL(find_get_pages_tag); 1571 1572 /** 1573 * find_get_entries_tag - find and return entries that match @tag 1574 * @mapping: the address_space to search 1575 * @start: the starting page cache index 1576 * @tag: the tag index 1577 * @nr_entries: the maximum number of entries 1578 * @entries: where the resulting entries are placed 1579 * @indices: the cache indices corresponding to the entries in @entries 1580 * 1581 * Like find_get_entries, except we only return entries which are tagged with 1582 * @tag. 1583 */ 1584 unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 1585 int tag, unsigned int nr_entries, 1586 struct page **entries, pgoff_t *indices) 1587 { 1588 void **slot; 1589 unsigned int ret = 0; 1590 struct radix_tree_iter iter; 1591 1592 if (!nr_entries) 1593 return 0; 1594 1595 rcu_read_lock(); 1596 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1597 &iter, start, tag) { 1598 struct page *head, *page; 1599 repeat: 1600 page = radix_tree_deref_slot(slot); 1601 if (unlikely(!page)) 1602 continue; 1603 if (radix_tree_exception(page)) { 1604 if (radix_tree_deref_retry(page)) { 1605 slot = radix_tree_iter_retry(&iter); 1606 continue; 1607 } 1608 1609 /* 1610 * A shadow entry of a recently evicted page, a swap 1611 * entry from shmem/tmpfs or a DAX entry. Return it 1612 * without attempting to raise page count. 1613 */ 1614 goto export; 1615 } 1616 1617 head = compound_head(page); 1618 if (!page_cache_get_speculative(head)) 1619 goto repeat; 1620 1621 /* The page was split under us? */ 1622 if (compound_head(page) != head) { 1623 put_page(head); 1624 goto repeat; 1625 } 1626 1627 /* Has the page moved? */ 1628 if (unlikely(page != *slot)) { 1629 put_page(head); 1630 goto repeat; 1631 } 1632 export: 1633 indices[ret] = iter.index; 1634 entries[ret] = page; 1635 if (++ret == nr_entries) 1636 break; 1637 } 1638 rcu_read_unlock(); 1639 return ret; 1640 } 1641 EXPORT_SYMBOL(find_get_entries_tag); 1642 1643 /* 1644 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 1645 * a _large_ part of the i/o request. Imagine the worst scenario: 1646 * 1647 * ---R__________________________________________B__________ 1648 * ^ reading here ^ bad block(assume 4k) 1649 * 1650 * read(R) => miss => readahead(R...B) => media error => frustrating retries 1651 * => failing the whole request => read(R) => read(R+1) => 1652 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => 1653 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => 1654 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... 1655 * 1656 * It is going insane. Fix it by quickly scaling down the readahead size. 1657 */ 1658 static void shrink_readahead_size_eio(struct file *filp, 1659 struct file_ra_state *ra) 1660 { 1661 ra->ra_pages /= 4; 1662 } 1663 1664 /** 1665 * do_generic_file_read - generic file read routine 1666 * @filp: the file to read 1667 * @ppos: current file position 1668 * @iter: data destination 1669 * @written: already copied 1670 * 1671 * This is a generic file read routine, and uses the 1672 * mapping->a_ops->readpage() function for the actual low-level stuff. 1673 * 1674 * This is really ugly. 
But the goto's actually try to clarify some 1675 * of the logic when it comes to error handling etc. 1676 */ 1677 static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, 1678 struct iov_iter *iter, ssize_t written) 1679 { 1680 struct address_space *mapping = filp->f_mapping; 1681 struct inode *inode = mapping->host; 1682 struct file_ra_state *ra = &filp->f_ra; 1683 pgoff_t index; 1684 pgoff_t last_index; 1685 pgoff_t prev_index; 1686 unsigned long offset; /* offset into pagecache page */ 1687 unsigned int prev_offset; 1688 int error = 0; 1689 1690 if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) 1691 return -EINVAL; 1692 iov_iter_truncate(iter, inode->i_sb->s_maxbytes); 1693 1694 index = *ppos >> PAGE_SHIFT; 1695 prev_index = ra->prev_pos >> PAGE_SHIFT; 1696 prev_offset = ra->prev_pos & (PAGE_SIZE-1); 1697 last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; 1698 offset = *ppos & ~PAGE_MASK; 1699 1700 for (;;) { 1701 struct page *page; 1702 pgoff_t end_index; 1703 loff_t isize; 1704 unsigned long nr, ret; 1705 1706 cond_resched(); 1707 find_page: 1708 page = find_get_page(mapping, index); 1709 if (!page) { 1710 page_cache_sync_readahead(mapping, 1711 ra, filp, 1712 index, last_index - index); 1713 page = find_get_page(mapping, index); 1714 if (unlikely(page == NULL)) 1715 goto no_cached_page; 1716 } 1717 if (PageReadahead(page)) { 1718 page_cache_async_readahead(mapping, 1719 ra, filp, page, 1720 index, last_index - index); 1721 } 1722 if (!PageUptodate(page)) { 1723 /* 1724 * See comment in do_read_cache_page on why 1725 * wait_on_page_locked is used to avoid unnecessarily 1726 * serialisations and why it's safe. 1727 */ 1728 error = wait_on_page_locked_killable(page); 1729 if (unlikely(error)) 1730 goto readpage_error; 1731 if (PageUptodate(page)) 1732 goto page_ok; 1733 1734 if (inode->i_blkbits == PAGE_SHIFT || 1735 !mapping->a_ops->is_partially_uptodate) 1736 goto page_not_up_to_date; 1737 if (!trylock_page(page)) 1738 goto page_not_up_to_date; 1739 /* Did it get truncated before we got the lock? */ 1740 if (!page->mapping) 1741 goto page_not_up_to_date_locked; 1742 if (!mapping->a_ops->is_partially_uptodate(page, 1743 offset, iter->count)) 1744 goto page_not_up_to_date_locked; 1745 unlock_page(page); 1746 } 1747 page_ok: 1748 /* 1749 * i_size must be checked after we know the page is Uptodate. 1750 * 1751 * Checking i_size after the check allows us to calculate 1752 * the correct value for "nr", which means the zero-filled 1753 * part of the page is not copied back to userspace (unless 1754 * another truncate extends the file - this is desired though). 1755 */ 1756 1757 isize = i_size_read(inode); 1758 end_index = (isize - 1) >> PAGE_SHIFT; 1759 if (unlikely(!isize || index > end_index)) { 1760 put_page(page); 1761 goto out; 1762 } 1763 1764 /* nr is the maximum number of bytes to copy from this page */ 1765 nr = PAGE_SIZE; 1766 if (index == end_index) { 1767 nr = ((isize - 1) & ~PAGE_MASK) + 1; 1768 if (nr <= offset) { 1769 put_page(page); 1770 goto out; 1771 } 1772 } 1773 nr = nr - offset; 1774 1775 /* If users can be writing to this page using arbitrary 1776 * virtual addresses, take care about potential aliasing 1777 * before reading the page on the kernel side. 1778 */ 1779 if (mapping_writably_mapped(mapping)) 1780 flush_dcache_page(page); 1781 1782 /* 1783 * When a sequential read accesses a page several times, 1784 * only mark it as accessed the first time. 
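 * prev_index/prev_offset record where the previous read stopped (seeded
 * from ra->prev_pos), so a read resuming at exactly that position skips
 * the mark_page_accessed() call below.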
1785 */ 1786 if (prev_index != index || offset != prev_offset) 1787 mark_page_accessed(page); 1788 prev_index = index; 1789 1790 /* 1791 * Ok, we have the page, and it's up-to-date, so 1792 * now we can copy it to user space... 1793 */ 1794 1795 ret = copy_page_to_iter(page, offset, nr, iter); 1796 offset += ret; 1797 index += offset >> PAGE_SHIFT; 1798 offset &= ~PAGE_MASK; 1799 prev_offset = offset; 1800 1801 put_page(page); 1802 written += ret; 1803 if (!iov_iter_count(iter)) 1804 goto out; 1805 if (ret < nr) { 1806 error = -EFAULT; 1807 goto out; 1808 } 1809 continue; 1810 1811 page_not_up_to_date: 1812 /* Get exclusive access to the page ... */ 1813 error = lock_page_killable(page); 1814 if (unlikely(error)) 1815 goto readpage_error; 1816 1817 page_not_up_to_date_locked: 1818 /* Did it get truncated before we got the lock? */ 1819 if (!page->mapping) { 1820 unlock_page(page); 1821 put_page(page); 1822 continue; 1823 } 1824 1825 /* Did somebody else fill it already? */ 1826 if (PageUptodate(page)) { 1827 unlock_page(page); 1828 goto page_ok; 1829 } 1830 1831 readpage: 1832 /* 1833 * A previous I/O error may have been due to temporary 1834 * failures, eg. multipath errors. 1835 * PG_error will be set again if readpage fails. 1836 */ 1837 ClearPageError(page); 1838 /* Start the actual read. The read will unlock the page. */ 1839 error = mapping->a_ops->readpage(filp, page); 1840 1841 if (unlikely(error)) { 1842 if (error == AOP_TRUNCATED_PAGE) { 1843 put_page(page); 1844 error = 0; 1845 goto find_page; 1846 } 1847 goto readpage_error; 1848 } 1849 1850 if (!PageUptodate(page)) { 1851 error = lock_page_killable(page); 1852 if (unlikely(error)) 1853 goto readpage_error; 1854 if (!PageUptodate(page)) { 1855 if (page->mapping == NULL) { 1856 /* 1857 * invalidate_mapping_pages got it 1858 */ 1859 unlock_page(page); 1860 put_page(page); 1861 goto find_page; 1862 } 1863 unlock_page(page); 1864 shrink_readahead_size_eio(filp, ra); 1865 error = -EIO; 1866 goto readpage_error; 1867 } 1868 unlock_page(page); 1869 } 1870 1871 goto page_ok; 1872 1873 readpage_error: 1874 /* UHHUH! A synchronous read error occurred. Report it */ 1875 put_page(page); 1876 goto out; 1877 1878 no_cached_page: 1879 /* 1880 * Ok, it wasn't cached, so we need to create a new 1881 * page.. 1882 */ 1883 page = page_cache_alloc_cold(mapping); 1884 if (!page) { 1885 error = -ENOMEM; 1886 goto out; 1887 } 1888 error = add_to_page_cache_lru(page, mapping, index, 1889 mapping_gfp_constraint(mapping, GFP_KERNEL)); 1890 if (error) { 1891 put_page(page); 1892 if (error == -EEXIST) { 1893 error = 0; 1894 goto find_page; 1895 } 1896 goto out; 1897 } 1898 goto readpage; 1899 } 1900 1901 out: 1902 ra->prev_pos = prev_index; 1903 ra->prev_pos <<= PAGE_SHIFT; 1904 ra->prev_pos |= prev_offset; 1905 1906 *ppos = ((loff_t)index << PAGE_SHIFT) + offset; 1907 file_accessed(filp); 1908 return written ? written : error; 1909 } 1910 1911 /** 1912 * generic_file_read_iter - generic filesystem read routine 1913 * @iocb: kernel I/O control block 1914 * @iter: destination for the data read 1915 * 1916 * This is the "read_iter()" routine for all filesystems 1917 * that can use the page cache directly. 
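 *
 * A minimal sketch of how a filesystem typically wires this up (the
 * example_file_ops name is illustrative only):
 *
 *	const struct file_operations example_file_ops = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.write_iter	= generic_file_write_iter,
 *		.mmap		= generic_file_mmap,
 *	};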
1918 */ 1919 ssize_t 1920 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) 1921 { 1922 struct file *file = iocb->ki_filp; 1923 ssize_t retval = 0; 1924 size_t count = iov_iter_count(iter); 1925 1926 if (!count) 1927 goto out; /* skip atime */ 1928 1929 if (iocb->ki_flags & IOCB_DIRECT) { 1930 struct address_space *mapping = file->f_mapping; 1931 struct inode *inode = mapping->host; 1932 struct iov_iter data = *iter; 1933 loff_t size; 1934 1935 size = i_size_read(inode); 1936 retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, 1937 iocb->ki_pos + count - 1); 1938 if (retval < 0) 1939 goto out; 1940 1941 file_accessed(file); 1942 1943 retval = mapping->a_ops->direct_IO(iocb, &data); 1944 if (retval >= 0) { 1945 iocb->ki_pos += retval; 1946 iov_iter_advance(iter, retval); 1947 } 1948 1949 /* 1950 * Btrfs can have a short DIO read if we encounter 1951 * compressed extents, so if there was an error, or if 1952 * we've already read everything we wanted to, or if 1953 * there was a short read because we hit EOF, go ahead 1954 * and return. Otherwise fallthrough to buffered io for 1955 * the rest of the read. Buffered reads will not work for 1956 * DAX files, so don't bother trying. 1957 */ 1958 if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size || 1959 IS_DAX(inode)) 1960 goto out; 1961 } 1962 1963 retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval); 1964 out: 1965 return retval; 1966 } 1967 EXPORT_SYMBOL(generic_file_read_iter); 1968 1969 #ifdef CONFIG_MMU 1970 /** 1971 * page_cache_read - adds requested page to the page cache if not already there 1972 * @file: file to read 1973 * @offset: page index 1974 * @gfp_mask: memory allocation flags 1975 * 1976 * This adds the requested page to the page cache if it isn't already there, 1977 * and schedules an I/O to read in its contents from disk. 1978 */ 1979 static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) 1980 { 1981 struct address_space *mapping = file->f_mapping; 1982 struct page *page; 1983 int ret; 1984 1985 do { 1986 page = __page_cache_alloc(gfp_mask|__GFP_COLD); 1987 if (!page) 1988 return -ENOMEM; 1989 1990 ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL); 1991 if (ret == 0) 1992 ret = mapping->a_ops->readpage(file, page); 1993 else if (ret == -EEXIST) 1994 ret = 0; /* losing race to add is OK */ 1995 1996 put_page(page); 1997 1998 } while (ret == AOP_TRUNCATED_PAGE); 1999 2000 return ret; 2001 } 2002 2003 #define MMAP_LOTSAMISS (100) 2004 2005 /* 2006 * Synchronous readahead happens when we don't even find 2007 * a page in the page cache at all. 2008 */ 2009 static void do_sync_mmap_readahead(struct vm_area_struct *vma, 2010 struct file_ra_state *ra, 2011 struct file *file, 2012 pgoff_t offset) 2013 { 2014 struct address_space *mapping = file->f_mapping; 2015 2016 /* If we don't want any read-ahead, don't bother */ 2017 if (vma->vm_flags & VM_RAND_READ) 2018 return; 2019 if (!ra->ra_pages) 2020 return; 2021 2022 if (vma->vm_flags & VM_SEQ_READ) { 2023 page_cache_sync_readahead(mapping, ra, file, offset, 2024 ra->ra_pages); 2025 return; 2026 } 2027 2028 /* Avoid banging the cache line if not needed */ 2029 if (ra->mmap_miss < MMAP_LOTSAMISS * 10) 2030 ra->mmap_miss++; 2031 2032 /* 2033 * Do we miss much more than hit in this file? If so, 2034 * stop bothering with read-ahead. It will only hurt. 
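 * MMAP_LOTSAMISS is 100; mmap_miss is only incremented up to ten times
 * that above, and is decremented again in do_async_mmap_readahead() once
 * faults start hitting the page cache.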
2035 */ 2036 if (ra->mmap_miss > MMAP_LOTSAMISS) 2037 return; 2038 2039 /* 2040 * mmap read-around 2041 */ 2042 ra->start = max_t(long, 0, offset - ra->ra_pages / 2); 2043 ra->size = ra->ra_pages; 2044 ra->async_size = ra->ra_pages / 4; 2045 ra_submit(ra, mapping, file); 2046 } 2047 2048 /* 2049 * Asynchronous readahead happens when we find the page and PG_readahead, 2050 * so we want to possibly extend the readahead further.. 2051 */ 2052 static void do_async_mmap_readahead(struct vm_area_struct *vma, 2053 struct file_ra_state *ra, 2054 struct file *file, 2055 struct page *page, 2056 pgoff_t offset) 2057 { 2058 struct address_space *mapping = file->f_mapping; 2059 2060 /* If we don't want any read-ahead, don't bother */ 2061 if (vma->vm_flags & VM_RAND_READ) 2062 return; 2063 if (ra->mmap_miss > 0) 2064 ra->mmap_miss--; 2065 if (PageReadahead(page)) 2066 page_cache_async_readahead(mapping, ra, file, 2067 page, offset, ra->ra_pages); 2068 } 2069 2070 /** 2071 * filemap_fault - read in file data for page fault handling 2072 * @vma: vma in which the fault was taken 2073 * @vmf: struct vm_fault containing details of the fault 2074 * 2075 * filemap_fault() is invoked via the vma operations vector for a 2076 * mapped memory region to read in file data during a page fault. 2077 * 2078 * The goto's are kind of ugly, but this streamlines the normal case of having 2079 * it in the page cache, and handles the special cases reasonably without 2080 * having a lot of duplicated code. 2081 * 2082 * vma->vm_mm->mmap_sem must be held on entry. 2083 * 2084 * If our return value has VM_FAULT_RETRY set, it's because 2085 * lock_page_or_retry() returned 0. 2086 * The mmap_sem has usually been released in this case. 2087 * See __lock_page_or_retry() for the exception. 2088 * 2089 * If our return value does not have VM_FAULT_RETRY set, the mmap_sem 2090 * has not been released. 2091 * 2092 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. 2093 */ 2094 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2095 { 2096 int error; 2097 struct file *file = vma->vm_file; 2098 struct address_space *mapping = file->f_mapping; 2099 struct file_ra_state *ra = &file->f_ra; 2100 struct inode *inode = mapping->host; 2101 pgoff_t offset = vmf->pgoff; 2102 struct page *page; 2103 loff_t size; 2104 int ret = 0; 2105 2106 size = round_up(i_size_read(inode), PAGE_SIZE); 2107 if (offset >= size >> PAGE_SHIFT) 2108 return VM_FAULT_SIGBUS; 2109 2110 /* 2111 * Do we have something in the page cache already? 2112 */ 2113 page = find_get_page(mapping, offset); 2114 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { 2115 /* 2116 * We found the page, so try async readahead before 2117 * waiting for the lock. 2118 */ 2119 do_async_mmap_readahead(vma, ra, file, page, offset); 2120 } else if (!page) { 2121 /* No page in the page cache at all */ 2122 do_sync_mmap_readahead(vma, ra, file, offset); 2123 count_vm_event(PGMAJFAULT); 2124 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 2125 ret = VM_FAULT_MAJOR; 2126 retry_find: 2127 page = find_get_page(mapping, offset); 2128 if (!page) 2129 goto no_cached_page; 2130 } 2131 2132 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { 2133 put_page(page); 2134 return ret | VM_FAULT_RETRY; 2135 } 2136 2137 /* Did it get truncated? 
*/
2138 if (unlikely(page->mapping != mapping)) {
2139 unlock_page(page);
2140 put_page(page);
2141 goto retry_find;
2142 }
2143 VM_BUG_ON_PAGE(page->index != offset, page);
2144
2145 /*
2146 * We have a locked page in the page cache, now we need to check
2147 * that it's up-to-date. If not, it is going to be due to an error.
2148 */
2149 if (unlikely(!PageUptodate(page)))
2150 goto page_not_uptodate;
2151
2152 /*
2153 * Found the page and have a reference on it.
2154 * We must recheck i_size under page lock.
2155 */
2156 size = round_up(i_size_read(inode), PAGE_SIZE);
2157 if (unlikely(offset >= size >> PAGE_SHIFT)) {
2158 unlock_page(page);
2159 put_page(page);
2160 return VM_FAULT_SIGBUS;
2161 }
2162
2163 vmf->page = page;
2164 return ret | VM_FAULT_LOCKED;
2165
2166 no_cached_page:
2167 /*
2168 * We're only likely to ever get here if MADV_RANDOM is in
2169 * effect.
2170 */
2171 error = page_cache_read(file, offset, vmf->gfp_mask);
2172
2173 /*
2174 * The page we want has now been added to the page cache.
2175 * In the unlikely event that someone removed it in the
2176 * meantime, we'll just come back here and read it again.
2177 */
2178 if (error >= 0)
2179 goto retry_find;
2180
2181 /*
2182 * An error return from page_cache_read can result if the
2183 * system is low on memory, or a problem occurs while trying
2184 * to schedule I/O.
2185 */
2186 if (error == -ENOMEM)
2187 return VM_FAULT_OOM;
2188 return VM_FAULT_SIGBUS;
2189
2190 page_not_uptodate:
2191 /*
2192 * Umm, take care of errors if the page isn't up-to-date.
2193 * Try to re-read it _once_. We do this synchronously,
2194 * because there really aren't any performance issues here
2195 * and we need to check for errors.
2196 */
2197 ClearPageError(page);
2198 error = mapping->a_ops->readpage(file, page);
2199 if (!error) {
2200 wait_on_page_locked(page);
2201 if (!PageUptodate(page))
2202 error = -EIO;
2203 }
2204 put_page(page);
2205
2206 if (!error || error == AOP_TRUNCATED_PAGE)
2207 goto retry_find;
2208
2209 /* Things didn't work out. Return VM_FAULT_SIGBUS to tell the mm layer so. */
2210 shrink_readahead_size_eio(file, ra);
2211 return VM_FAULT_SIGBUS;
2212 }
2213 EXPORT_SYMBOL(filemap_fault);
2214
2215 void filemap_map_pages(struct fault_env *fe,
2216 pgoff_t start_pgoff, pgoff_t end_pgoff)
2217 {
2218 struct radix_tree_iter iter;
2219 void **slot;
2220 struct file *file = fe->vma->vm_file;
2221 struct address_space *mapping = file->f_mapping;
2222 pgoff_t last_pgoff = start_pgoff;
2223 loff_t size;
2224 struct page *head, *page;
2225
2226 rcu_read_lock();
2227 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
2228 start_pgoff) {
2229 if (iter.index > end_pgoff)
2230 break;
2231 repeat:
2232 page = radix_tree_deref_slot(slot);
2233 if (unlikely(!page))
2234 goto next;
2235 if (radix_tree_exception(page)) {
2236 if (radix_tree_deref_retry(page)) {
2237 slot = radix_tree_iter_retry(&iter);
2238 continue;
2239 }
2240 goto next;
2241 }
2242
2243 head = compound_head(page);
2244 if (!page_cache_get_speculative(head))
2245 goto repeat;
2246
2247 /* The page was split under us? */
2248 if (compound_head(page) != head) {
2249 put_page(head);
2250 goto repeat;
2251 }
2252
2253 /* Has the page moved?
*/ 2254 if (unlikely(page != *slot)) { 2255 put_page(head); 2256 goto repeat; 2257 } 2258 2259 if (!PageUptodate(page) || 2260 PageReadahead(page) || 2261 PageHWPoison(page)) 2262 goto skip; 2263 if (!trylock_page(page)) 2264 goto skip; 2265 2266 if (page->mapping != mapping || !PageUptodate(page)) 2267 goto unlock; 2268 2269 size = round_up(i_size_read(mapping->host), PAGE_SIZE); 2270 if (page->index >= size >> PAGE_SHIFT) 2271 goto unlock; 2272 2273 if (file->f_ra.mmap_miss > 0) 2274 file->f_ra.mmap_miss--; 2275 2276 fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; 2277 if (fe->pte) 2278 fe->pte += iter.index - last_pgoff; 2279 last_pgoff = iter.index; 2280 if (alloc_set_pte(fe, NULL, page)) 2281 goto unlock; 2282 unlock_page(page); 2283 goto next; 2284 unlock: 2285 unlock_page(page); 2286 skip: 2287 put_page(page); 2288 next: 2289 /* Huge page is mapped? No need to proceed. */ 2290 if (pmd_trans_huge(*fe->pmd)) 2291 break; 2292 if (iter.index == end_pgoff) 2293 break; 2294 } 2295 rcu_read_unlock(); 2296 } 2297 EXPORT_SYMBOL(filemap_map_pages); 2298 2299 int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 2300 { 2301 struct page *page = vmf->page; 2302 struct inode *inode = file_inode(vma->vm_file); 2303 int ret = VM_FAULT_LOCKED; 2304 2305 sb_start_pagefault(inode->i_sb); 2306 file_update_time(vma->vm_file); 2307 lock_page(page); 2308 if (page->mapping != inode->i_mapping) { 2309 unlock_page(page); 2310 ret = VM_FAULT_NOPAGE; 2311 goto out; 2312 } 2313 /* 2314 * We mark the page dirty already here so that when freeze is in 2315 * progress, we are guaranteed that writeback during freezing will 2316 * see the dirty page and writeprotect it again. 2317 */ 2318 set_page_dirty(page); 2319 wait_for_stable_page(page); 2320 out: 2321 sb_end_pagefault(inode->i_sb); 2322 return ret; 2323 } 2324 EXPORT_SYMBOL(filemap_page_mkwrite); 2325 2326 const struct vm_operations_struct generic_file_vm_ops = { 2327 .fault = filemap_fault, 2328 .map_pages = filemap_map_pages, 2329 .page_mkwrite = filemap_page_mkwrite, 2330 }; 2331 2332 /* This is used for a general mmap of a disk file */ 2333 2334 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 2335 { 2336 struct address_space *mapping = file->f_mapping; 2337 2338 if (!mapping->a_ops->readpage) 2339 return -ENOEXEC; 2340 file_accessed(file); 2341 vma->vm_ops = &generic_file_vm_ops; 2342 return 0; 2343 } 2344 2345 /* 2346 * This is for filesystems which do not implement ->writepage. 
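 *
 * Sketch of the intended use (illustrative only, no specific filesystem
 * is implied): such a filesystem would set
 *
 *	.mmap	= generic_file_readonly_mmap,
 *
 * in its file_operations, so that writable shared mappings are refused
 * with -EINVAL below while private and read-only shared mappings still
 * go through generic_file_mmap().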
2347 */
2348 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
2349 {
2350 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
2351 return -EINVAL;
2352 return generic_file_mmap(file, vma);
2353 }
2354 #else
2355 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
2356 {
2357 return -ENOSYS;
2358 }
2359 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
2360 {
2361 return -ENOSYS;
2362 }
2363 #endif /* CONFIG_MMU */
2364
2365 EXPORT_SYMBOL(generic_file_mmap);
2366 EXPORT_SYMBOL(generic_file_readonly_mmap);
2367
2368 static struct page *wait_on_page_read(struct page *page)
2369 {
2370 if (!IS_ERR(page)) {
2371 wait_on_page_locked(page);
2372 if (!PageUptodate(page)) {
2373 put_page(page);
2374 page = ERR_PTR(-EIO);
2375 }
2376 }
2377 return page;
2378 }
2379
2380 static struct page *do_read_cache_page(struct address_space *mapping,
2381 pgoff_t index,
2382 int (*filler)(void *, struct page *),
2383 void *data,
2384 gfp_t gfp)
2385 {
2386 struct page *page;
2387 int err;
2388 repeat:
2389 page = find_get_page(mapping, index);
2390 if (!page) {
2391 page = __page_cache_alloc(gfp | __GFP_COLD);
2392 if (!page)
2393 return ERR_PTR(-ENOMEM);
2394 err = add_to_page_cache_lru(page, mapping, index, gfp);
2395 if (unlikely(err)) {
2396 put_page(page);
2397 if (err == -EEXIST)
2398 goto repeat;
2399 /* Presumably ENOMEM for radix tree node */
2400 return ERR_PTR(err);
2401 }
2402
2403 filler:
2404 err = filler(data, page);
2405 if (err < 0) {
2406 put_page(page);
2407 return ERR_PTR(err);
2408 }
2409
2410 page = wait_on_page_read(page);
2411 if (IS_ERR(page))
2412 return page;
2413 goto out;
2414 }
2415 if (PageUptodate(page))
2416 goto out;
2417
2418 /*
2419 * Page is not up to date and may be locked due to one of the following
2420 * case a: Page is being filled and the page lock is held
2421 * case b: Read/write error clearing the page uptodate status
2422 * case c: Truncation in progress (page locked)
2423 * case d: Reclaim in progress
2424 *
2425 * Case a, the page will be up to date when the page is unlocked.
2426 * There is no need to serialise on the page lock here as the page
2427 * is pinned so the lock gives no additional protection. Even if
2428 * the page is truncated, the data is still valid if PageUptodate as
2429 * it's just a read vs truncate race.
2430 * Case b, the page will not be up to date.
2431 * Case c, the page may be truncated but in itself, the data may still
2432 * be valid after IO completes as it's a read vs truncate race. The
2433 * operation must restart if the page is not uptodate on unlock but
2434 * otherwise serialising on page lock to stabilise the mapping gives
2435 * no additional guarantees to the caller as the page lock is
2436 * released before return.
2437 * Case d, similar to truncation. If reclaim holds the page lock, it
2438 * will be a race with remove_mapping that determines if the mapping
2439 * is valid on unlock but otherwise the data is valid and there is
2440 * no need to serialise with page lock.
2441 *
2442 * As the page lock gives no additional guarantee, we optimistically
2443 * wait on the page to be unlocked and check if it's up to date and
2444 * use the page if it is. Otherwise, the page lock is required to
2445 * distinguish between the different cases. The motivation is that we
2446 * avoid spurious serialisations and wakeups when multiple processes
2447 * wait on the same page for IO to complete.
2448 */
2449 wait_on_page_locked(page);
2450 if (PageUptodate(page))
2451 goto out;
2452
2453 /* Distinguish between all the cases under the safety of the lock */
2454 lock_page(page);
2455
2456 /* Case c or d, restart the operation */
2457 if (!page->mapping) {
2458 unlock_page(page);
2459 put_page(page);
2460 goto repeat;
2461 }
2462
2463 /* Someone else locked and filled the page in a very small window */
2464 if (PageUptodate(page)) {
2465 unlock_page(page);
2466 goto out;
2467 }
2468 goto filler;
2469
2470 out:
2471 mark_page_accessed(page);
2472 return page;
2473 }
2474
2475 /**
2476 * read_cache_page - read into page cache, fill it if needed
2477 * @mapping: the page's address_space
2478 * @index: the page index
2479 * @filler: function to perform the read
2480 * @data: first arg to filler(data, page) function, often left as NULL
2481 *
2482 * Read into the page cache. If a page already exists, and PageUptodate() is
2483 * not set, try to fill the page and wait for it to become unlocked.
2484 *
2485 * If the page does not get brought uptodate, return -EIO.
2486 */
2487 struct page *read_cache_page(struct address_space *mapping,
2488 pgoff_t index,
2489 int (*filler)(void *, struct page *),
2490 void *data)
2491 {
2492 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
2493 }
2494 EXPORT_SYMBOL(read_cache_page);
2495
2496 /**
2497 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
2498 * @mapping: the page's address_space
2499 * @index: the page index
2500 * @gfp: the page allocator flags to use if allocating
2501 *
2502 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
2503 * any new page allocations done using the specified allocation flags.
2504 *
2505 * If the page does not get brought uptodate, return -EIO.
2506 */
2507 struct page *read_cache_page_gfp(struct address_space *mapping,
2508 pgoff_t index,
2509 gfp_t gfp)
2510 {
2511 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
2512
2513 return do_read_cache_page(mapping, index, filler, NULL, gfp);
2514 }
2515 EXPORT_SYMBOL(read_cache_page_gfp);
2516
2517 /*
2518 * Performs necessary checks before doing a write
2519 *
2520 * Can adjust writing position or amount of bytes to write.
2521 * Returns a negative error code that the caller should return, or the
2522 * (possibly shortened) number of bytes to write; zero means nothing to write.
2523 */
2524 inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
2525 {
2526 struct file *file = iocb->ki_filp;
2527 struct inode *inode = file->f_mapping->host;
2528 unsigned long limit = rlimit(RLIMIT_FSIZE);
2529 loff_t pos;
2530
2531 if (!iov_iter_count(from))
2532 return 0;
2533
2534 /* FIXME: this is for backwards compatibility with 2.4 */
2535 if (iocb->ki_flags & IOCB_APPEND)
2536 iocb->ki_pos = i_size_read(inode);
2537
2538 pos = iocb->ki_pos;
2539
2540 if (limit != RLIM_INFINITY) {
2541 if (iocb->ki_pos >= limit) {
2542 send_sig(SIGXFSZ, current, 0);
2543 return -EFBIG;
2544 }
2545 iov_iter_truncate(from, limit - (unsigned long)pos);
2546 }
2547
2548 /*
2549 * LFS rule
2550 */
2551 if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
2552 !(file->f_flags & O_LARGEFILE))) {
2553 if (pos >= MAX_NON_LFS)
2554 return -EFBIG;
2555 iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
2556 }
2557
2558 /*
2559 * Are we about to exceed the fs block limit?
2560 *
2561 * If we have written data it becomes a short write. If we have
2562 * exceeded without writing data we send a signal and return EFBIG.
2563 * Linus frestrict idea will clean these up nicely.. 2564 */ 2565 if (unlikely(pos >= inode->i_sb->s_maxbytes)) 2566 return -EFBIG; 2567 2568 iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); 2569 return iov_iter_count(from); 2570 } 2571 EXPORT_SYMBOL(generic_write_checks); 2572 2573 int pagecache_write_begin(struct file *file, struct address_space *mapping, 2574 loff_t pos, unsigned len, unsigned flags, 2575 struct page **pagep, void **fsdata) 2576 { 2577 const struct address_space_operations *aops = mapping->a_ops; 2578 2579 return aops->write_begin(file, mapping, pos, len, flags, 2580 pagep, fsdata); 2581 } 2582 EXPORT_SYMBOL(pagecache_write_begin); 2583 2584 int pagecache_write_end(struct file *file, struct address_space *mapping, 2585 loff_t pos, unsigned len, unsigned copied, 2586 struct page *page, void *fsdata) 2587 { 2588 const struct address_space_operations *aops = mapping->a_ops; 2589 2590 return aops->write_end(file, mapping, pos, len, copied, page, fsdata); 2591 } 2592 EXPORT_SYMBOL(pagecache_write_end); 2593 2594 ssize_t 2595 generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) 2596 { 2597 struct file *file = iocb->ki_filp; 2598 struct address_space *mapping = file->f_mapping; 2599 struct inode *inode = mapping->host; 2600 loff_t pos = iocb->ki_pos; 2601 ssize_t written; 2602 size_t write_len; 2603 pgoff_t end; 2604 struct iov_iter data; 2605 2606 write_len = iov_iter_count(from); 2607 end = (pos + write_len - 1) >> PAGE_SHIFT; 2608 2609 written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); 2610 if (written) 2611 goto out; 2612 2613 /* 2614 * After a write we want buffered reads to be sure to go to disk to get 2615 * the new data. We invalidate clean cached page from the region we're 2616 * about to write. We do this *before* the write so that we can return 2617 * without clobbering -EIOCBQUEUED from ->direct_IO(). 2618 */ 2619 if (mapping->nrpages) { 2620 written = invalidate_inode_pages2_range(mapping, 2621 pos >> PAGE_SHIFT, end); 2622 /* 2623 * If a page can not be invalidated, return 0 to fall back 2624 * to buffered write. 2625 */ 2626 if (written) { 2627 if (written == -EBUSY) 2628 return 0; 2629 goto out; 2630 } 2631 } 2632 2633 data = *from; 2634 written = mapping->a_ops->direct_IO(iocb, &data); 2635 2636 /* 2637 * Finally, try again to invalidate clean pages which might have been 2638 * cached by non-direct readahead, or faulted in by get_user_pages() 2639 * if the source of the write was an mmap'ed region of the file 2640 * we're writing. Either one is a pretty crazy thing to do, 2641 * so we don't support it 100%. If this invalidation 2642 * fails, tough, the write still worked... 2643 */ 2644 if (mapping->nrpages) { 2645 invalidate_inode_pages2_range(mapping, 2646 pos >> PAGE_SHIFT, end); 2647 } 2648 2649 if (written > 0) { 2650 pos += written; 2651 iov_iter_advance(from, written); 2652 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2653 i_size_write(inode, pos); 2654 mark_inode_dirty(inode); 2655 } 2656 iocb->ki_pos = pos; 2657 } 2658 out: 2659 return written; 2660 } 2661 EXPORT_SYMBOL(generic_file_direct_write); 2662 2663 /* 2664 * Find or create a page at the given pagecache position. Return the locked 2665 * page. This function is specifically for buffered writes. 
2666 */ 2667 struct page *grab_cache_page_write_begin(struct address_space *mapping, 2668 pgoff_t index, unsigned flags) 2669 { 2670 struct page *page; 2671 int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT; 2672 2673 if (flags & AOP_FLAG_NOFS) 2674 fgp_flags |= FGP_NOFS; 2675 2676 page = pagecache_get_page(mapping, index, fgp_flags, 2677 mapping_gfp_mask(mapping)); 2678 if (page) 2679 wait_for_stable_page(page); 2680 2681 return page; 2682 } 2683 EXPORT_SYMBOL(grab_cache_page_write_begin); 2684 2685 ssize_t generic_perform_write(struct file *file, 2686 struct iov_iter *i, loff_t pos) 2687 { 2688 struct address_space *mapping = file->f_mapping; 2689 const struct address_space_operations *a_ops = mapping->a_ops; 2690 long status = 0; 2691 ssize_t written = 0; 2692 unsigned int flags = 0; 2693 2694 /* 2695 * Copies from kernel address space cannot fail (NFSD is a big user). 2696 */ 2697 if (!iter_is_iovec(i)) 2698 flags |= AOP_FLAG_UNINTERRUPTIBLE; 2699 2700 do { 2701 struct page *page; 2702 unsigned long offset; /* Offset into pagecache page */ 2703 unsigned long bytes; /* Bytes to write to page */ 2704 size_t copied; /* Bytes copied from user */ 2705 void *fsdata; 2706 2707 offset = (pos & (PAGE_SIZE - 1)); 2708 bytes = min_t(unsigned long, PAGE_SIZE - offset, 2709 iov_iter_count(i)); 2710 2711 again: 2712 /* 2713 * Bring in the user page that we will copy from _first_. 2714 * Otherwise there's a nasty deadlock on copying from the 2715 * same page as we're writing to, without it being marked 2716 * up-to-date. 2717 * 2718 * Not only is this an optimisation, but it is also required 2719 * to check that the address is actually valid, when atomic 2720 * usercopies are used, below. 2721 */ 2722 if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 2723 status = -EFAULT; 2724 break; 2725 } 2726 2727 if (fatal_signal_pending(current)) { 2728 status = -EINTR; 2729 break; 2730 } 2731 2732 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2733 &page, &fsdata); 2734 if (unlikely(status < 0)) 2735 break; 2736 2737 if (mapping_writably_mapped(mapping)) 2738 flush_dcache_page(page); 2739 2740 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2741 flush_dcache_page(page); 2742 2743 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2744 page, fsdata); 2745 if (unlikely(status < 0)) 2746 break; 2747 copied = status; 2748 2749 cond_resched(); 2750 2751 iov_iter_advance(i, copied); 2752 if (unlikely(copied == 0)) { 2753 /* 2754 * If we were unable to copy any data at all, we must 2755 * fall back to a single segment length write. 2756 * 2757 * If we didn't fallback here, we could livelock 2758 * because not all segments in the iov can be copied at 2759 * once without a pagefault. 2760 */ 2761 bytes = min_t(unsigned long, PAGE_SIZE - offset, 2762 iov_iter_single_seg_count(i)); 2763 goto again; 2764 } 2765 pos += copied; 2766 written += copied; 2767 2768 balance_dirty_pages_ratelimited(mapping); 2769 } while (iov_iter_count(i)); 2770 2771 return written ? written : status; 2772 } 2773 EXPORT_SYMBOL(generic_perform_write); 2774 2775 /** 2776 * __generic_file_write_iter - write data to a file 2777 * @iocb: IO state structure (file, offset, etc.) 2778 * @from: iov_iter with data to write 2779 * 2780 * This function does all the work needed for actually writing data to a 2781 * file. It does all basic checks, removes SUID from the file, updates 2782 * modification times and calls proper subroutines depending on whether we 2783 * do direct IO or a standard buffered write. 
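 *
 * A minimal caller sketch, for illustration (this is essentially what
 * generic_file_write_iter() below does):
 *
 *	inode_lock(inode);
 *	ret = generic_write_checks(iocb, from);
 *	if (ret > 0)
 *		ret = __generic_file_write_iter(iocb, from);
 *	inode_unlock(inode);
 *	if (ret > 0)
 *		ret = generic_write_sync(iocb, ret);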
2784 * 2785 * It expects i_mutex to be grabbed unless we work on a block device or similar 2786 * object which does not need locking at all. 2787 * 2788 * This function does *not* take care of syncing data in case of O_SYNC write. 2789 * A caller has to handle it. This is mainly due to the fact that we want to 2790 * avoid syncing under i_mutex. 2791 */ 2792 ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 2793 { 2794 struct file *file = iocb->ki_filp; 2795 struct address_space * mapping = file->f_mapping; 2796 struct inode *inode = mapping->host; 2797 ssize_t written = 0; 2798 ssize_t err; 2799 ssize_t status; 2800 2801 /* We can write back this queue in page reclaim */ 2802 current->backing_dev_info = inode_to_bdi(inode); 2803 err = file_remove_privs(file); 2804 if (err) 2805 goto out; 2806 2807 err = file_update_time(file); 2808 if (err) 2809 goto out; 2810 2811 if (iocb->ki_flags & IOCB_DIRECT) { 2812 loff_t pos, endbyte; 2813 2814 written = generic_file_direct_write(iocb, from); 2815 /* 2816 * If the write stopped short of completing, fall back to 2817 * buffered writes. Some filesystems do this for writes to 2818 * holes, for example. For DAX files, a buffered write will 2819 * not succeed (even if it did, DAX does not handle dirty 2820 * page-cache pages correctly). 2821 */ 2822 if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) 2823 goto out; 2824 2825 status = generic_perform_write(file, from, pos = iocb->ki_pos); 2826 /* 2827 * If generic_perform_write() returned a synchronous error 2828 * then we want to return the number of bytes which were 2829 * direct-written, or the error code if that was zero. Note 2830 * that this differs from normal direct-io semantics, which 2831 * will return -EFOO even if some bytes were written. 2832 */ 2833 if (unlikely(status < 0)) { 2834 err = status; 2835 goto out; 2836 } 2837 /* 2838 * We need to ensure that the page cache pages are written to 2839 * disk and invalidated to preserve the expected O_DIRECT 2840 * semantics. 2841 */ 2842 endbyte = pos + status - 1; 2843 err = filemap_write_and_wait_range(mapping, pos, endbyte); 2844 if (err == 0) { 2845 iocb->ki_pos = endbyte + 1; 2846 written += status; 2847 invalidate_mapping_pages(mapping, 2848 pos >> PAGE_SHIFT, 2849 endbyte >> PAGE_SHIFT); 2850 } else { 2851 /* 2852 * We don't know how much we wrote, so just return 2853 * the number of bytes which were direct-written 2854 */ 2855 } 2856 } else { 2857 written = generic_perform_write(file, from, iocb->ki_pos); 2858 if (likely(written > 0)) 2859 iocb->ki_pos += written; 2860 } 2861 out: 2862 current->backing_dev_info = NULL; 2863 return written ? written : err; 2864 } 2865 EXPORT_SYMBOL(__generic_file_write_iter); 2866 2867 /** 2868 * generic_file_write_iter - write data to a file 2869 * @iocb: IO state structure 2870 * @from: iov_iter with data to write 2871 * 2872 * This is a wrapper around __generic_file_write_iter() to be used by most 2873 * filesystems. It takes care of syncing the file in case of O_SYNC file 2874 * and acquires i_mutex as needed. 
*/
2876 ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2877 {
2878 struct file *file = iocb->ki_filp;
2879 struct inode *inode = file->f_mapping->host;
2880 ssize_t ret;
2881
2882 inode_lock(inode);
2883 ret = generic_write_checks(iocb, from);
2884 if (ret > 0)
2885 ret = __generic_file_write_iter(iocb, from);
2886 inode_unlock(inode);
2887
2888 if (ret > 0)
2889 ret = generic_write_sync(iocb, ret);
2890 return ret;
2891 }
2892 EXPORT_SYMBOL(generic_file_write_iter);
2893
2894 /**
2895 * try_to_release_page() - release old fs-specific metadata on a page
2896 *
2897 * @page: the page which the kernel is trying to free
2898 * @gfp_mask: memory allocation flags (and I/O mode)
2899 *
2900 * The address_space is asked to try to release any data held against the page
2901 * (presumably at page->private). If the release was successful, return `1'.
2902 * Otherwise return zero.
2903 *
2904 * This may also be called if PG_fscache is set on a page, indicating that the
2905 * page is known to the local caching routines.
2906 *
2907 * The @gfp_mask argument specifies whether I/O may be performed to release
2908 * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM and __GFP_FS).
2909 *
2910 */
2911 int try_to_release_page(struct page *page, gfp_t gfp_mask)
2912 {
2913 struct address_space * const mapping = page->mapping;
2914
2915 BUG_ON(!PageLocked(page));
2916 if (PageWriteback(page))
2917 return 0;
2918
2919 if (mapping && mapping->a_ops->releasepage)
2920 return mapping->a_ops->releasepage(page, gfp_mask);
2921 return try_to_free_buffers(page);
2922 }
2923
2924 EXPORT_SYMBOL(try_to_release_page);
2925
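/*
 * Illustrative only and deliberately compiled out: a minimal
 * file_operations table for a hypothetical filesystem that leans
 * entirely on the generic page-cache helpers exported above. The name
 * and the exact set of methods shown are assumptions for the example,
 * not a recommendation baked into this file.
 */
#if 0
static const struct file_operations example_pagecache_fops = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.mmap		= generic_file_mmap,
	.splice_read	= generic_file_splice_read,
};
#endif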