1 /* 2 * linux/mm/filemap.c 3 * 4 * Copyright (C) 1994-1999 Linus Torvalds 5 */ 6 7 /* 8 * This file handles the generic file mmap semantics used by 9 * most "normal" filesystems (but you don't /have/ to use this: 10 * the NFS filesystem used to do this differently, for example) 11 */ 12 #include <linux/export.h> 13 #include <linux/compiler.h> 14 #include <linux/dax.h> 15 #include <linux/fs.h> 16 #include <linux/uaccess.h> 17 #include <linux/capability.h> 18 #include <linux/kernel_stat.h> 19 #include <linux/gfp.h> 20 #include <linux/mm.h> 21 #include <linux/swap.h> 22 #include <linux/mman.h> 23 #include <linux/pagemap.h> 24 #include <linux/file.h> 25 #include <linux/uio.h> 26 #include <linux/hash.h> 27 #include <linux/writeback.h> 28 #include <linux/backing-dev.h> 29 #include <linux/pagevec.h> 30 #include <linux/blkdev.h> 31 #include <linux/security.h> 32 #include <linux/cpuset.h> 33 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34 #include <linux/hugetlb.h> 35 #include <linux/memcontrol.h> 36 #include <linux/cleancache.h> 37 #include <linux/rmap.h> 38 #include "internal.h" 39 40 #define CREATE_TRACE_POINTS 41 #include <trace/events/filemap.h> 42 43 /* 44 * FIXME: remove all knowledge of the buffer layer from the core VM 45 */ 46 #include <linux/buffer_head.h> /* for try_to_free_buffers */ 47 48 #include <asm/mman.h> 49 50 /* 51 * Shared mappings implemented 30.11.1994. It's not fully working yet, 52 * though. 53 * 54 * Shared mappings now work. 15.8.1995 Bruno. 55 * 56 * finished 'unifying' the page and buffer cache and SMP-threaded the 57 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> 58 * 59 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> 60 */ 61 62 /* 63 * Lock ordering: 64 * 65 * ->i_mmap_rwsem (truncate_pagecache) 66 * ->private_lock (__free_pte->__set_page_dirty_buffers) 67 * ->swap_lock (exclusive_swap_page, others) 68 * ->mapping->tree_lock 69 * 70 * ->i_mutex 71 * ->i_mmap_rwsem (truncate->unmap_mapping_range) 72 * 73 * ->mmap_sem 74 * ->i_mmap_rwsem 75 * ->page_table_lock or pte_lock (various, mainly in memory.c) 76 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 77 * 78 * ->mmap_sem 79 * ->lock_page (access_process_vm) 80 * 81 * ->i_mutex (generic_perform_write) 82 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 83 * 84 * bdi->wb.list_lock 85 * sb_lock (fs/fs-writeback.c) 86 * ->mapping->tree_lock (__sync_single_inode) 87 * 88 * ->i_mmap_rwsem 89 * ->anon_vma.lock (vma_adjust) 90 * 91 * ->anon_vma.lock 92 * ->page_table_lock or pte_lock (anon_vma_prepare and various) 93 * 94 * ->page_table_lock or pte_lock 95 * ->swap_lock (try_to_unmap_one) 96 * ->private_lock (try_to_unmap_one) 97 * ->tree_lock (try_to_unmap_one) 98 * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) 99 * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) 100 * ->private_lock (page_remove_rmap->set_page_dirty) 101 * ->tree_lock (page_remove_rmap->set_page_dirty) 102 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) 103 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 104 * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) 105 * bdi.wb->list_lock (zap_pte_range->set_page_dirty) 106 * ->inode->i_lock (zap_pte_range->set_page_dirty) 107 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 108 * 109 * ->i_mmap_rwsem 110 * ->tasklist_lock (memory_failure, collect_procs_ao) 111 */ 112 113 static int page_cache_tree_insert(struct address_space *mapping, 114 struct page *page, void **shadowp) 115 { 116 struct radix_tree_node *node; 117 void **slot; 118 int error; 119 120 error = __radix_tree_create(&mapping->page_tree, page->index, 0, 121 &node, &slot); 122 if (error) 123 return error; 124 if (*slot) { 125 void *p; 126 127 p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 128 if (!radix_tree_exceptional_entry(p)) 129 return -EEXIST; 130 131 mapping->nrexceptional--; 132 if (!dax_mapping(mapping)) { 133 if (shadowp) 134 *shadowp = p; 135 if (node) 136 workingset_node_shadows_dec(node); 137 } else { 138 /* DAX can replace empty locked entry with a hole */ 139 WARN_ON_ONCE(p != 140 (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | 141 RADIX_DAX_ENTRY_LOCK)); 142 /* DAX accounts exceptional entries as normal pages */ 143 if (node) 144 workingset_node_pages_dec(node); 145 /* Wakeup waiters for exceptional entry lock */ 146 dax_wake_mapping_entry_waiter(mapping, page->index, 147 false); 148 } 149 } 150 radix_tree_replace_slot(slot, page); 151 mapping->nrpages++; 152 if (node) { 153 workingset_node_pages_inc(node); 154 /* 155 * Don't track node that contains actual pages. 156 * 157 * Avoid acquiring the list_lru lock if already 158 * untracked. The list_empty() test is safe as 159 * node->private_list is protected by 160 * mapping->tree_lock. 161 */ 162 if (!list_empty(&node->private_list)) 163 list_lru_del(&workingset_shadow_nodes, 164 &node->private_list); 165 } 166 return 0; 167 } 168 169 static void page_cache_tree_delete(struct address_space *mapping, 170 struct page *page, void *shadow) 171 { 172 int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); 173 174 VM_BUG_ON_PAGE(!PageLocked(page), page); 175 VM_BUG_ON_PAGE(PageTail(page), page); 176 VM_BUG_ON_PAGE(nr != 1 && shadow, page); 177 178 for (i = 0; i < nr; i++) { 179 struct radix_tree_node *node; 180 void **slot; 181 182 __radix_tree_lookup(&mapping->page_tree, page->index + i, 183 &node, &slot); 184 185 radix_tree_clear_tags(&mapping->page_tree, node, slot); 186 187 if (!node) { 188 VM_BUG_ON_PAGE(nr != 1, page); 189 /* 190 * We need a node to properly account shadow 191 * entries. Don't plant any without. XXX 192 */ 193 shadow = NULL; 194 } 195 196 radix_tree_replace_slot(slot, shadow); 197 198 if (!node) 199 break; 200 201 workingset_node_pages_dec(node); 202 if (shadow) 203 workingset_node_shadows_inc(node); 204 else 205 if (__radix_tree_delete_node(&mapping->page_tree, node)) 206 continue; 207 208 /* 209 * Track node that only contains shadow entries. DAX mappings 210 * contain no shadow entries and may contain other exceptional 211 * entries so skip those. 212 * 213 * Avoid acquiring the list_lru lock if already tracked. 214 * The list_empty() test is safe as node->private_list is 215 * protected by mapping->tree_lock. 216 */ 217 if (!dax_mapping(mapping) && !workingset_node_pages(node) && 218 list_empty(&node->private_list)) { 219 node->private_data = mapping; 220 list_lru_add(&workingset_shadow_nodes, 221 &node->private_list); 222 } 223 } 224 225 if (shadow) { 226 mapping->nrexceptional += nr; 227 /* 228 * Make sure the nrexceptional update is committed before 229 * the nrpages update so that final truncate racing 230 * with reclaim does not see both counters 0 at the 231 * same time and miss a shadow entry. 232 */ 233 smp_wmb(); 234 } 235 mapping->nrpages -= nr; 236 } 237 238 /* 239 * Delete a page from the page cache and free it. Caller has to make 240 * sure the page is locked and that nobody else uses it - or that usage 241 * is safe. The caller must hold the mapping's tree_lock. 242 */ 243 void __delete_from_page_cache(struct page *page, void *shadow) 244 { 245 struct address_space *mapping = page->mapping; 246 int nr = hpage_nr_pages(page); 247 248 trace_mm_filemap_delete_from_page_cache(page); 249 /* 250 * if we're uptodate, flush out into the cleancache, otherwise 251 * invalidate any existing cleancache entries. We can't leave 252 * stale data around in the cleancache once our page is gone 253 */ 254 if (PageUptodate(page) && PageMappedToDisk(page)) 255 cleancache_put_page(page); 256 else 257 cleancache_invalidate_page(mapping, page); 258 259 VM_BUG_ON_PAGE(PageTail(page), page); 260 VM_BUG_ON_PAGE(page_mapped(page), page); 261 if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { 262 int mapcount; 263 264 pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", 265 current->comm, page_to_pfn(page)); 266 dump_page(page, "still mapped when deleted"); 267 dump_stack(); 268 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 269 270 mapcount = page_mapcount(page); 271 if (mapping_exiting(mapping) && 272 page_count(page) >= mapcount + 2) { 273 /* 274 * All vmas have already been torn down, so it's 275 * a good bet that actually the page is unmapped, 276 * and we'd prefer not to leak it: if we're wrong, 277 * some other bad page check should catch it later. 278 */ 279 page_mapcount_reset(page); 280 page_ref_sub(page, mapcount); 281 } 282 } 283 284 page_cache_tree_delete(mapping, page, shadow); 285 286 page->mapping = NULL; 287 /* Leave page->index set: truncation lookup relies upon it */ 288 289 /* hugetlb pages do not participate in page cache accounting. */ 290 if (!PageHuge(page)) 291 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); 292 if (PageSwapBacked(page)) { 293 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); 294 if (PageTransHuge(page)) 295 __dec_node_page_state(page, NR_SHMEM_THPS); 296 } else { 297 VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); 298 } 299 300 /* 301 * At this point page must be either written or cleaned by truncate. 302 * Dirty page here signals a bug and loss of unwritten data. 303 * 304 * This fixes dirty accounting after removing the page entirely but 305 * leaves PageDirty set: it has no effect for truncated page and 306 * anyway will be cleared before returning page into buddy allocator. 307 */ 308 if (WARN_ON_ONCE(PageDirty(page))) 309 account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); 310 } 311 312 /** 313 * delete_from_page_cache - delete page from page cache 314 * @page: the page which the kernel is trying to remove from page cache 315 * 316 * This must be called only on pages that have been verified to be in the page 317 * cache and locked. It will never put the page into the free list, the caller 318 * has a reference on the page. 319 */ 320 void delete_from_page_cache(struct page *page) 321 { 322 struct address_space *mapping = page_mapping(page); 323 unsigned long flags; 324 void (*freepage)(struct page *); 325 326 BUG_ON(!PageLocked(page)); 327 328 freepage = mapping->a_ops->freepage; 329 330 spin_lock_irqsave(&mapping->tree_lock, flags); 331 __delete_from_page_cache(page, NULL); 332 spin_unlock_irqrestore(&mapping->tree_lock, flags); 333 334 if (freepage) 335 freepage(page); 336 337 if (PageTransHuge(page) && !PageHuge(page)) { 338 page_ref_sub(page, HPAGE_PMD_NR); 339 VM_BUG_ON_PAGE(page_count(page) <= 0, page); 340 } else { 341 put_page(page); 342 } 343 } 344 EXPORT_SYMBOL(delete_from_page_cache); 345 346 int filemap_check_errors(struct address_space *mapping) 347 { 348 int ret = 0; 349 /* Check for outstanding write errors */ 350 if (test_bit(AS_ENOSPC, &mapping->flags) && 351 test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 352 ret = -ENOSPC; 353 if (test_bit(AS_EIO, &mapping->flags) && 354 test_and_clear_bit(AS_EIO, &mapping->flags)) 355 ret = -EIO; 356 return ret; 357 } 358 EXPORT_SYMBOL(filemap_check_errors); 359 360 /** 361 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range 362 * @mapping: address space structure to write 363 * @start: offset in bytes where the range starts 364 * @end: offset in bytes where the range ends (inclusive) 365 * @sync_mode: enable synchronous operation 366 * 367 * Start writeback against all of a mapping's dirty pages that lie 368 * within the byte offsets <start, end> inclusive. 369 * 370 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 371 * opposed to a regular memory cleansing writeback. The difference between 372 * these two operations is that if a dirty page/buffer is encountered, it must 373 * be waited upon, and not just skipped over. 374 */ 375 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 376 loff_t end, int sync_mode) 377 { 378 int ret; 379 struct writeback_control wbc = { 380 .sync_mode = sync_mode, 381 .nr_to_write = LONG_MAX, 382 .range_start = start, 383 .range_end = end, 384 }; 385 386 if (!mapping_cap_writeback_dirty(mapping)) 387 return 0; 388 389 wbc_attach_fdatawrite_inode(&wbc, mapping->host); 390 ret = do_writepages(mapping, &wbc); 391 wbc_detach_inode(&wbc); 392 return ret; 393 } 394 395 static inline int __filemap_fdatawrite(struct address_space *mapping, 396 int sync_mode) 397 { 398 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); 399 } 400 401 int filemap_fdatawrite(struct address_space *mapping) 402 { 403 return __filemap_fdatawrite(mapping, WB_SYNC_ALL); 404 } 405 EXPORT_SYMBOL(filemap_fdatawrite); 406 407 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 408 loff_t end) 409 { 410 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 411 } 412 EXPORT_SYMBOL(filemap_fdatawrite_range); 413 414 /** 415 * filemap_flush - mostly a non-blocking flush 416 * @mapping: target address_space 417 * 418 * This is a mostly non-blocking flush. Not suitable for data-integrity 419 * purposes - I/O may not be started against all dirty pages. 420 */ 421 int filemap_flush(struct address_space *mapping) 422 { 423 return __filemap_fdatawrite(mapping, WB_SYNC_NONE); 424 } 425 EXPORT_SYMBOL(filemap_flush); 426 427 static int __filemap_fdatawait_range(struct address_space *mapping, 428 loff_t start_byte, loff_t end_byte) 429 { 430 pgoff_t index = start_byte >> PAGE_SHIFT; 431 pgoff_t end = end_byte >> PAGE_SHIFT; 432 struct pagevec pvec; 433 int nr_pages; 434 int ret = 0; 435 436 if (end_byte < start_byte) 437 goto out; 438 439 pagevec_init(&pvec, 0); 440 while ((index <= end) && 441 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 442 PAGECACHE_TAG_WRITEBACK, 443 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { 444 unsigned i; 445 446 for (i = 0; i < nr_pages; i++) { 447 struct page *page = pvec.pages[i]; 448 449 /* until radix tree lookup accepts end_index */ 450 if (page->index > end) 451 continue; 452 453 wait_on_page_writeback(page); 454 if (TestClearPageError(page)) 455 ret = -EIO; 456 } 457 pagevec_release(&pvec); 458 cond_resched(); 459 } 460 out: 461 return ret; 462 } 463 464 /** 465 * filemap_fdatawait_range - wait for writeback to complete 466 * @mapping: address space structure to wait for 467 * @start_byte: offset in bytes where the range starts 468 * @end_byte: offset in bytes where the range ends (inclusive) 469 * 470 * Walk the list of under-writeback pages of the given address space 471 * in the given range and wait for all of them. Check error status of 472 * the address space and return it. 473 * 474 * Since the error status of the address space is cleared by this function, 475 * callers are responsible for checking the return value and handling and/or 476 * reporting the error. 477 */ 478 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, 479 loff_t end_byte) 480 { 481 int ret, ret2; 482 483 ret = __filemap_fdatawait_range(mapping, start_byte, end_byte); 484 ret2 = filemap_check_errors(mapping); 485 if (!ret) 486 ret = ret2; 487 488 return ret; 489 } 490 EXPORT_SYMBOL(filemap_fdatawait_range); 491 492 /** 493 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors 494 * @mapping: address space structure to wait for 495 * 496 * Walk the list of under-writeback pages of the given address space 497 * and wait for all of them. Unlike filemap_fdatawait(), this function 498 * does not clear error status of the address space. 499 * 500 * Use this function if callers don't handle errors themselves. Expected 501 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), 502 * fsfreeze(8) 503 */ 504 void filemap_fdatawait_keep_errors(struct address_space *mapping) 505 { 506 loff_t i_size = i_size_read(mapping->host); 507 508 if (i_size == 0) 509 return; 510 511 __filemap_fdatawait_range(mapping, 0, i_size - 1); 512 } 513 514 /** 515 * filemap_fdatawait - wait for all under-writeback pages to complete 516 * @mapping: address space structure to wait for 517 * 518 * Walk the list of under-writeback pages of the given address space 519 * and wait for all of them. Check error status of the address space 520 * and return it. 521 * 522 * Since the error status of the address space is cleared by this function, 523 * callers are responsible for checking the return value and handling and/or 524 * reporting the error. 525 */ 526 int filemap_fdatawait(struct address_space *mapping) 527 { 528 loff_t i_size = i_size_read(mapping->host); 529 530 if (i_size == 0) 531 return 0; 532 533 return filemap_fdatawait_range(mapping, 0, i_size - 1); 534 } 535 EXPORT_SYMBOL(filemap_fdatawait); 536 537 int filemap_write_and_wait(struct address_space *mapping) 538 { 539 int err = 0; 540 541 if ((!dax_mapping(mapping) && mapping->nrpages) || 542 (dax_mapping(mapping) && mapping->nrexceptional)) { 543 err = filemap_fdatawrite(mapping); 544 /* 545 * Even if the above returned error, the pages may be 546 * written partially (e.g. -ENOSPC), so we wait for it. 547 * But the -EIO is special case, it may indicate the worst 548 * thing (e.g. bug) happened, so we avoid waiting for it. 549 */ 550 if (err != -EIO) { 551 int err2 = filemap_fdatawait(mapping); 552 if (!err) 553 err = err2; 554 } 555 } else { 556 err = filemap_check_errors(mapping); 557 } 558 return err; 559 } 560 EXPORT_SYMBOL(filemap_write_and_wait); 561 562 /** 563 * filemap_write_and_wait_range - write out & wait on a file range 564 * @mapping: the address_space for the pages 565 * @lstart: offset in bytes where the range starts 566 * @lend: offset in bytes where the range ends (inclusive) 567 * 568 * Write out and wait upon file offsets lstart->lend, inclusive. 569 * 570 * Note that `lend' is inclusive (describes the last byte to be written) so 571 * that this function can be used to write to the very end-of-file (end = -1). 572 */ 573 int filemap_write_and_wait_range(struct address_space *mapping, 574 loff_t lstart, loff_t lend) 575 { 576 int err = 0; 577 578 if ((!dax_mapping(mapping) && mapping->nrpages) || 579 (dax_mapping(mapping) && mapping->nrexceptional)) { 580 err = __filemap_fdatawrite_range(mapping, lstart, lend, 581 WB_SYNC_ALL); 582 /* See comment of filemap_write_and_wait() */ 583 if (err != -EIO) { 584 int err2 = filemap_fdatawait_range(mapping, 585 lstart, lend); 586 if (!err) 587 err = err2; 588 } 589 } else { 590 err = filemap_check_errors(mapping); 591 } 592 return err; 593 } 594 EXPORT_SYMBOL(filemap_write_and_wait_range); 595 596 /** 597 * replace_page_cache_page - replace a pagecache page with a new one 598 * @old: page to be replaced 599 * @new: page to replace with 600 * @gfp_mask: allocation mode 601 * 602 * This function replaces a page in the pagecache with a new one. On 603 * success it acquires the pagecache reference for the new page and 604 * drops it for the old page. Both the old and new pages must be 605 * locked. This function does not add the new page to the LRU, the 606 * caller must do that. 607 * 608 * The remove + add is atomic. The only way this function can fail is 609 * memory allocation failure. 610 */ 611 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 612 { 613 int error; 614 615 VM_BUG_ON_PAGE(!PageLocked(old), old); 616 VM_BUG_ON_PAGE(!PageLocked(new), new); 617 VM_BUG_ON_PAGE(new->mapping, new); 618 619 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 620 if (!error) { 621 struct address_space *mapping = old->mapping; 622 void (*freepage)(struct page *); 623 unsigned long flags; 624 625 pgoff_t offset = old->index; 626 freepage = mapping->a_ops->freepage; 627 628 get_page(new); 629 new->mapping = mapping; 630 new->index = offset; 631 632 spin_lock_irqsave(&mapping->tree_lock, flags); 633 __delete_from_page_cache(old, NULL); 634 error = page_cache_tree_insert(mapping, new, NULL); 635 BUG_ON(error); 636 637 /* 638 * hugetlb pages do not participate in page cache accounting. 639 */ 640 if (!PageHuge(new)) 641 __inc_node_page_state(new, NR_FILE_PAGES); 642 if (PageSwapBacked(new)) 643 __inc_node_page_state(new, NR_SHMEM); 644 spin_unlock_irqrestore(&mapping->tree_lock, flags); 645 mem_cgroup_migrate(old, new); 646 radix_tree_preload_end(); 647 if (freepage) 648 freepage(old); 649 put_page(old); 650 } 651 652 return error; 653 } 654 EXPORT_SYMBOL_GPL(replace_page_cache_page); 655 656 static int __add_to_page_cache_locked(struct page *page, 657 struct address_space *mapping, 658 pgoff_t offset, gfp_t gfp_mask, 659 void **shadowp) 660 { 661 int huge = PageHuge(page); 662 struct mem_cgroup *memcg; 663 int error; 664 665 VM_BUG_ON_PAGE(!PageLocked(page), page); 666 VM_BUG_ON_PAGE(PageSwapBacked(page), page); 667 668 if (!huge) { 669 error = mem_cgroup_try_charge(page, current->mm, 670 gfp_mask, &memcg, false); 671 if (error) 672 return error; 673 } 674 675 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); 676 if (error) { 677 if (!huge) 678 mem_cgroup_cancel_charge(page, memcg, false); 679 return error; 680 } 681 682 get_page(page); 683 page->mapping = mapping; 684 page->index = offset; 685 686 spin_lock_irq(&mapping->tree_lock); 687 error = page_cache_tree_insert(mapping, page, shadowp); 688 radix_tree_preload_end(); 689 if (unlikely(error)) 690 goto err_insert; 691 692 /* hugetlb pages do not participate in page cache accounting. */ 693 if (!huge) 694 __inc_node_page_state(page, NR_FILE_PAGES); 695 spin_unlock_irq(&mapping->tree_lock); 696 if (!huge) 697 mem_cgroup_commit_charge(page, memcg, false, false); 698 trace_mm_filemap_add_to_page_cache(page); 699 return 0; 700 err_insert: 701 page->mapping = NULL; 702 /* Leave page->index set: truncation relies upon it */ 703 spin_unlock_irq(&mapping->tree_lock); 704 if (!huge) 705 mem_cgroup_cancel_charge(page, memcg, false); 706 put_page(page); 707 return error; 708 } 709 710 /** 711 * add_to_page_cache_locked - add a locked page to the pagecache 712 * @page: page to add 713 * @mapping: the page's address_space 714 * @offset: page index 715 * @gfp_mask: page allocation mode 716 * 717 * This function is used to add a page to the pagecache. It must be locked. 718 * This function does not add the page to the LRU. The caller must do that. 719 */ 720 int add_to_page_cache_locked(struct page *page, struct address_space *mapping, 721 pgoff_t offset, gfp_t gfp_mask) 722 { 723 return __add_to_page_cache_locked(page, mapping, offset, 724 gfp_mask, NULL); 725 } 726 EXPORT_SYMBOL(add_to_page_cache_locked); 727 728 int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 729 pgoff_t offset, gfp_t gfp_mask) 730 { 731 void *shadow = NULL; 732 int ret; 733 734 __SetPageLocked(page); 735 ret = __add_to_page_cache_locked(page, mapping, offset, 736 gfp_mask, &shadow); 737 if (unlikely(ret)) 738 __ClearPageLocked(page); 739 else { 740 /* 741 * The page might have been evicted from cache only 742 * recently, in which case it should be activated like 743 * any other repeatedly accessed page. 744 * The exception is pages getting rewritten; evicting other 745 * data from the working set, only to cache data that will 746 * get overwritten with something else, is a waste of memory. 747 */ 748 if (!(gfp_mask & __GFP_WRITE) && 749 shadow && workingset_refault(shadow)) { 750 SetPageActive(page); 751 workingset_activation(page); 752 } else 753 ClearPageActive(page); 754 lru_cache_add(page); 755 } 756 return ret; 757 } 758 EXPORT_SYMBOL_GPL(add_to_page_cache_lru); 759 760 #ifdef CONFIG_NUMA 761 struct page *__page_cache_alloc(gfp_t gfp) 762 { 763 int n; 764 struct page *page; 765 766 if (cpuset_do_page_mem_spread()) { 767 unsigned int cpuset_mems_cookie; 768 do { 769 cpuset_mems_cookie = read_mems_allowed_begin(); 770 n = cpuset_mem_spread_node(); 771 page = __alloc_pages_node(n, gfp, 0); 772 } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); 773 774 return page; 775 } 776 return alloc_pages(gfp, 0); 777 } 778 EXPORT_SYMBOL(__page_cache_alloc); 779 #endif 780 781 /* 782 * In order to wait for pages to become available there must be 783 * waitqueues associated with pages. By using a hash table of 784 * waitqueues where the bucket discipline is to maintain all 785 * waiters on the same queue and wake all when any of the pages 786 * become available, and for the woken contexts to check to be 787 * sure the appropriate page became available, this saves space 788 * at a cost of "thundering herd" phenomena during rare hash 789 * collisions. 790 */ 791 wait_queue_head_t *page_waitqueue(struct page *page) 792 { 793 return bit_waitqueue(page, 0); 794 } 795 EXPORT_SYMBOL(page_waitqueue); 796 797 void wait_on_page_bit(struct page *page, int bit_nr) 798 { 799 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 800 801 if (test_bit(bit_nr, &page->flags)) 802 __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io, 803 TASK_UNINTERRUPTIBLE); 804 } 805 EXPORT_SYMBOL(wait_on_page_bit); 806 807 int wait_on_page_bit_killable(struct page *page, int bit_nr) 808 { 809 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 810 811 if (!test_bit(bit_nr, &page->flags)) 812 return 0; 813 814 return __wait_on_bit(page_waitqueue(page), &wait, 815 bit_wait_io, TASK_KILLABLE); 816 } 817 818 int wait_on_page_bit_killable_timeout(struct page *page, 819 int bit_nr, unsigned long timeout) 820 { 821 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 822 823 wait.key.timeout = jiffies + timeout; 824 if (!test_bit(bit_nr, &page->flags)) 825 return 0; 826 return __wait_on_bit(page_waitqueue(page), &wait, 827 bit_wait_io_timeout, TASK_KILLABLE); 828 } 829 EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout); 830 831 /** 832 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 833 * @page: Page defining the wait queue of interest 834 * @waiter: Waiter to add to the queue 835 * 836 * Add an arbitrary @waiter to the wait queue for the nominated @page. 837 */ 838 void add_page_wait_queue(struct page *page, wait_queue_t *waiter) 839 { 840 wait_queue_head_t *q = page_waitqueue(page); 841 unsigned long flags; 842 843 spin_lock_irqsave(&q->lock, flags); 844 __add_wait_queue(q, waiter); 845 spin_unlock_irqrestore(&q->lock, flags); 846 } 847 EXPORT_SYMBOL_GPL(add_page_wait_queue); 848 849 /** 850 * unlock_page - unlock a locked page 851 * @page: the page 852 * 853 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). 854 * Also wakes sleepers in wait_on_page_writeback() because the wakeup 855 * mechanism between PageLocked pages and PageWriteback pages is shared. 856 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 857 * 858 * The mb is necessary to enforce ordering between the clear_bit and the read 859 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). 860 */ 861 void unlock_page(struct page *page) 862 { 863 page = compound_head(page); 864 VM_BUG_ON_PAGE(!PageLocked(page), page); 865 clear_bit_unlock(PG_locked, &page->flags); 866 smp_mb__after_atomic(); 867 wake_up_page(page, PG_locked); 868 } 869 EXPORT_SYMBOL(unlock_page); 870 871 /** 872 * end_page_writeback - end writeback against a page 873 * @page: the page 874 */ 875 void end_page_writeback(struct page *page) 876 { 877 /* 878 * TestClearPageReclaim could be used here but it is an atomic 879 * operation and overkill in this particular case. Failing to 880 * shuffle a page marked for immediate reclaim is too mild to 881 * justify taking an atomic operation penalty at the end of 882 * ever page writeback. 883 */ 884 if (PageReclaim(page)) { 885 ClearPageReclaim(page); 886 rotate_reclaimable_page(page); 887 } 888 889 if (!test_clear_page_writeback(page)) 890 BUG(); 891 892 smp_mb__after_atomic(); 893 wake_up_page(page, PG_writeback); 894 } 895 EXPORT_SYMBOL(end_page_writeback); 896 897 /* 898 * After completing I/O on a page, call this routine to update the page 899 * flags appropriately 900 */ 901 void page_endio(struct page *page, bool is_write, int err) 902 { 903 if (!is_write) { 904 if (!err) { 905 SetPageUptodate(page); 906 } else { 907 ClearPageUptodate(page); 908 SetPageError(page); 909 } 910 unlock_page(page); 911 } else { 912 if (err) { 913 SetPageError(page); 914 if (page->mapping) 915 mapping_set_error(page->mapping, err); 916 } 917 end_page_writeback(page); 918 } 919 } 920 EXPORT_SYMBOL_GPL(page_endio); 921 922 /** 923 * __lock_page - get a lock on the page, assuming we need to sleep to get it 924 * @page: the page to lock 925 */ 926 void __lock_page(struct page *page) 927 { 928 struct page *page_head = compound_head(page); 929 DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); 930 931 __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, 932 TASK_UNINTERRUPTIBLE); 933 } 934 EXPORT_SYMBOL(__lock_page); 935 936 int __lock_page_killable(struct page *page) 937 { 938 struct page *page_head = compound_head(page); 939 DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); 940 941 return __wait_on_bit_lock(page_waitqueue(page_head), &wait, 942 bit_wait_io, TASK_KILLABLE); 943 } 944 EXPORT_SYMBOL_GPL(__lock_page_killable); 945 946 /* 947 * Return values: 948 * 1 - page is locked; mmap_sem is still held. 949 * 0 - page is not locked. 950 * mmap_sem has been released (up_read()), unless flags had both 951 * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in 952 * which case mmap_sem is still held. 953 * 954 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 955 * with the page locked and the mmap_sem unperturbed. 956 */ 957 int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 958 unsigned int flags) 959 { 960 if (flags & FAULT_FLAG_ALLOW_RETRY) { 961 /* 962 * CAUTION! In this case, mmap_sem is not released 963 * even though return 0. 964 */ 965 if (flags & FAULT_FLAG_RETRY_NOWAIT) 966 return 0; 967 968 up_read(&mm->mmap_sem); 969 if (flags & FAULT_FLAG_KILLABLE) 970 wait_on_page_locked_killable(page); 971 else 972 wait_on_page_locked(page); 973 return 0; 974 } else { 975 if (flags & FAULT_FLAG_KILLABLE) { 976 int ret; 977 978 ret = __lock_page_killable(page); 979 if (ret) { 980 up_read(&mm->mmap_sem); 981 return 0; 982 } 983 } else 984 __lock_page(page); 985 return 1; 986 } 987 } 988 989 /** 990 * page_cache_next_hole - find the next hole (not-present entry) 991 * @mapping: mapping 992 * @index: index 993 * @max_scan: maximum range to search 994 * 995 * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the 996 * lowest indexed hole. 997 * 998 * Returns: the index of the hole if found, otherwise returns an index 999 * outside of the set specified (in which case 'return - index >= 1000 * max_scan' will be true). In rare cases of index wrap-around, 0 will 1001 * be returned. 1002 * 1003 * page_cache_next_hole may be called under rcu_read_lock. However, 1004 * like radix_tree_gang_lookup, this will not atomically search a 1005 * snapshot of the tree at a single point in time. For example, if a 1006 * hole is created at index 5, then subsequently a hole is created at 1007 * index 10, page_cache_next_hole covering both indexes may return 10 1008 * if called under rcu_read_lock. 1009 */ 1010 pgoff_t page_cache_next_hole(struct address_space *mapping, 1011 pgoff_t index, unsigned long max_scan) 1012 { 1013 unsigned long i; 1014 1015 for (i = 0; i < max_scan; i++) { 1016 struct page *page; 1017 1018 page = radix_tree_lookup(&mapping->page_tree, index); 1019 if (!page || radix_tree_exceptional_entry(page)) 1020 break; 1021 index++; 1022 if (index == 0) 1023 break; 1024 } 1025 1026 return index; 1027 } 1028 EXPORT_SYMBOL(page_cache_next_hole); 1029 1030 /** 1031 * page_cache_prev_hole - find the prev hole (not-present entry) 1032 * @mapping: mapping 1033 * @index: index 1034 * @max_scan: maximum range to search 1035 * 1036 * Search backwards in the range [max(index-max_scan+1, 0), index] for 1037 * the first hole. 1038 * 1039 * Returns: the index of the hole if found, otherwise returns an index 1040 * outside of the set specified (in which case 'index - return >= 1041 * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX 1042 * will be returned. 1043 * 1044 * page_cache_prev_hole may be called under rcu_read_lock. However, 1045 * like radix_tree_gang_lookup, this will not atomically search a 1046 * snapshot of the tree at a single point in time. For example, if a 1047 * hole is created at index 10, then subsequently a hole is created at 1048 * index 5, page_cache_prev_hole covering both indexes may return 5 if 1049 * called under rcu_read_lock. 1050 */ 1051 pgoff_t page_cache_prev_hole(struct address_space *mapping, 1052 pgoff_t index, unsigned long max_scan) 1053 { 1054 unsigned long i; 1055 1056 for (i = 0; i < max_scan; i++) { 1057 struct page *page; 1058 1059 page = radix_tree_lookup(&mapping->page_tree, index); 1060 if (!page || radix_tree_exceptional_entry(page)) 1061 break; 1062 index--; 1063 if (index == ULONG_MAX) 1064 break; 1065 } 1066 1067 return index; 1068 } 1069 EXPORT_SYMBOL(page_cache_prev_hole); 1070 1071 /** 1072 * find_get_entry - find and get a page cache entry 1073 * @mapping: the address_space to search 1074 * @offset: the page cache index 1075 * 1076 * Looks up the page cache slot at @mapping & @offset. If there is a 1077 * page cache page, it is returned with an increased refcount. 1078 * 1079 * If the slot holds a shadow entry of a previously evicted page, or a 1080 * swap entry from shmem/tmpfs, it is returned. 1081 * 1082 * Otherwise, %NULL is returned. 1083 */ 1084 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) 1085 { 1086 void **pagep; 1087 struct page *head, *page; 1088 1089 rcu_read_lock(); 1090 repeat: 1091 page = NULL; 1092 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); 1093 if (pagep) { 1094 page = radix_tree_deref_slot(pagep); 1095 if (unlikely(!page)) 1096 goto out; 1097 if (radix_tree_exception(page)) { 1098 if (radix_tree_deref_retry(page)) 1099 goto repeat; 1100 /* 1101 * A shadow entry of a recently evicted page, 1102 * or a swap entry from shmem/tmpfs. Return 1103 * it without attempting to raise page count. 1104 */ 1105 goto out; 1106 } 1107 1108 head = compound_head(page); 1109 if (!page_cache_get_speculative(head)) 1110 goto repeat; 1111 1112 /* The page was split under us? */ 1113 if (compound_head(page) != head) { 1114 put_page(head); 1115 goto repeat; 1116 } 1117 1118 /* 1119 * Has the page moved? 1120 * This is part of the lockless pagecache protocol. See 1121 * include/linux/pagemap.h for details. 1122 */ 1123 if (unlikely(page != *pagep)) { 1124 put_page(head); 1125 goto repeat; 1126 } 1127 } 1128 out: 1129 rcu_read_unlock(); 1130 1131 return page; 1132 } 1133 EXPORT_SYMBOL(find_get_entry); 1134 1135 /** 1136 * find_lock_entry - locate, pin and lock a page cache entry 1137 * @mapping: the address_space to search 1138 * @offset: the page cache index 1139 * 1140 * Looks up the page cache slot at @mapping & @offset. If there is a 1141 * page cache page, it is returned locked and with an increased 1142 * refcount. 1143 * 1144 * If the slot holds a shadow entry of a previously evicted page, or a 1145 * swap entry from shmem/tmpfs, it is returned. 1146 * 1147 * Otherwise, %NULL is returned. 1148 * 1149 * find_lock_entry() may sleep. 1150 */ 1151 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) 1152 { 1153 struct page *page; 1154 1155 repeat: 1156 page = find_get_entry(mapping, offset); 1157 if (page && !radix_tree_exception(page)) { 1158 lock_page(page); 1159 /* Has the page been truncated? */ 1160 if (unlikely(page_mapping(page) != mapping)) { 1161 unlock_page(page); 1162 put_page(page); 1163 goto repeat; 1164 } 1165 VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); 1166 } 1167 return page; 1168 } 1169 EXPORT_SYMBOL(find_lock_entry); 1170 1171 /** 1172 * pagecache_get_page - find and get a page reference 1173 * @mapping: the address_space to search 1174 * @offset: the page index 1175 * @fgp_flags: PCG flags 1176 * @gfp_mask: gfp mask to use for the page cache data page allocation 1177 * 1178 * Looks up the page cache slot at @mapping & @offset. 1179 * 1180 * PCG flags modify how the page is returned. 1181 * 1182 * FGP_ACCESSED: the page will be marked accessed 1183 * FGP_LOCK: Page is return locked 1184 * FGP_CREAT: If page is not present then a new page is allocated using 1185 * @gfp_mask and added to the page cache and the VM's LRU 1186 * list. The page is returned locked and with an increased 1187 * refcount. Otherwise, %NULL is returned. 1188 * 1189 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even 1190 * if the GFP flags specified for FGP_CREAT are atomic. 1191 * 1192 * If there is a page cache page, it is returned with an increased refcount. 1193 */ 1194 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, 1195 int fgp_flags, gfp_t gfp_mask) 1196 { 1197 struct page *page; 1198 1199 repeat: 1200 page = find_get_entry(mapping, offset); 1201 if (radix_tree_exceptional_entry(page)) 1202 page = NULL; 1203 if (!page) 1204 goto no_page; 1205 1206 if (fgp_flags & FGP_LOCK) { 1207 if (fgp_flags & FGP_NOWAIT) { 1208 if (!trylock_page(page)) { 1209 put_page(page); 1210 return NULL; 1211 } 1212 } else { 1213 lock_page(page); 1214 } 1215 1216 /* Has the page been truncated? */ 1217 if (unlikely(page->mapping != mapping)) { 1218 unlock_page(page); 1219 put_page(page); 1220 goto repeat; 1221 } 1222 VM_BUG_ON_PAGE(page->index != offset, page); 1223 } 1224 1225 if (page && (fgp_flags & FGP_ACCESSED)) 1226 mark_page_accessed(page); 1227 1228 no_page: 1229 if (!page && (fgp_flags & FGP_CREAT)) { 1230 int err; 1231 if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) 1232 gfp_mask |= __GFP_WRITE; 1233 if (fgp_flags & FGP_NOFS) 1234 gfp_mask &= ~__GFP_FS; 1235 1236 page = __page_cache_alloc(gfp_mask); 1237 if (!page) 1238 return NULL; 1239 1240 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) 1241 fgp_flags |= FGP_LOCK; 1242 1243 /* Init accessed so avoid atomic mark_page_accessed later */ 1244 if (fgp_flags & FGP_ACCESSED) 1245 __SetPageReferenced(page); 1246 1247 err = add_to_page_cache_lru(page, mapping, offset, 1248 gfp_mask & GFP_RECLAIM_MASK); 1249 if (unlikely(err)) { 1250 put_page(page); 1251 page = NULL; 1252 if (err == -EEXIST) 1253 goto repeat; 1254 } 1255 } 1256 1257 return page; 1258 } 1259 EXPORT_SYMBOL(pagecache_get_page); 1260 1261 /** 1262 * find_get_entries - gang pagecache lookup 1263 * @mapping: The address_space to search 1264 * @start: The starting page cache index 1265 * @nr_entries: The maximum number of entries 1266 * @entries: Where the resulting entries are placed 1267 * @indices: The cache indices corresponding to the entries in @entries 1268 * 1269 * find_get_entries() will search for and return a group of up to 1270 * @nr_entries entries in the mapping. The entries are placed at 1271 * @entries. find_get_entries() takes a reference against any actual 1272 * pages it returns. 1273 * 1274 * The search returns a group of mapping-contiguous page cache entries 1275 * with ascending indexes. There may be holes in the indices due to 1276 * not-present pages. 1277 * 1278 * Any shadow entries of evicted pages, or swap entries from 1279 * shmem/tmpfs, are included in the returned array. 1280 * 1281 * find_get_entries() returns the number of pages and shadow entries 1282 * which were found. 1283 */ 1284 unsigned find_get_entries(struct address_space *mapping, 1285 pgoff_t start, unsigned int nr_entries, 1286 struct page **entries, pgoff_t *indices) 1287 { 1288 void **slot; 1289 unsigned int ret = 0; 1290 struct radix_tree_iter iter; 1291 1292 if (!nr_entries) 1293 return 0; 1294 1295 rcu_read_lock(); 1296 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 1297 struct page *head, *page; 1298 repeat: 1299 page = radix_tree_deref_slot(slot); 1300 if (unlikely(!page)) 1301 continue; 1302 if (radix_tree_exception(page)) { 1303 if (radix_tree_deref_retry(page)) { 1304 slot = radix_tree_iter_retry(&iter); 1305 continue; 1306 } 1307 /* 1308 * A shadow entry of a recently evicted page, a swap 1309 * entry from shmem/tmpfs or a DAX entry. Return it 1310 * without attempting to raise page count. 1311 */ 1312 goto export; 1313 } 1314 1315 head = compound_head(page); 1316 if (!page_cache_get_speculative(head)) 1317 goto repeat; 1318 1319 /* The page was split under us? */ 1320 if (compound_head(page) != head) { 1321 put_page(head); 1322 goto repeat; 1323 } 1324 1325 /* Has the page moved? */ 1326 if (unlikely(page != *slot)) { 1327 put_page(head); 1328 goto repeat; 1329 } 1330 export: 1331 indices[ret] = iter.index; 1332 entries[ret] = page; 1333 if (++ret == nr_entries) 1334 break; 1335 } 1336 rcu_read_unlock(); 1337 return ret; 1338 } 1339 1340 /** 1341 * find_get_pages - gang pagecache lookup 1342 * @mapping: The address_space to search 1343 * @start: The starting page index 1344 * @nr_pages: The maximum number of pages 1345 * @pages: Where the resulting pages are placed 1346 * 1347 * find_get_pages() will search for and return a group of up to 1348 * @nr_pages pages in the mapping. The pages are placed at @pages. 1349 * find_get_pages() takes a reference against the returned pages. 1350 * 1351 * The search returns a group of mapping-contiguous pages with ascending 1352 * indexes. There may be holes in the indices due to not-present pages. 1353 * 1354 * find_get_pages() returns the number of pages which were found. 1355 */ 1356 unsigned find_get_pages(struct address_space *mapping, pgoff_t start, 1357 unsigned int nr_pages, struct page **pages) 1358 { 1359 struct radix_tree_iter iter; 1360 void **slot; 1361 unsigned ret = 0; 1362 1363 if (unlikely(!nr_pages)) 1364 return 0; 1365 1366 rcu_read_lock(); 1367 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 1368 struct page *head, *page; 1369 repeat: 1370 page = radix_tree_deref_slot(slot); 1371 if (unlikely(!page)) 1372 continue; 1373 1374 if (radix_tree_exception(page)) { 1375 if (radix_tree_deref_retry(page)) { 1376 slot = radix_tree_iter_retry(&iter); 1377 continue; 1378 } 1379 /* 1380 * A shadow entry of a recently evicted page, 1381 * or a swap entry from shmem/tmpfs. Skip 1382 * over it. 1383 */ 1384 continue; 1385 } 1386 1387 head = compound_head(page); 1388 if (!page_cache_get_speculative(head)) 1389 goto repeat; 1390 1391 /* The page was split under us? */ 1392 if (compound_head(page) != head) { 1393 put_page(head); 1394 goto repeat; 1395 } 1396 1397 /* Has the page moved? */ 1398 if (unlikely(page != *slot)) { 1399 put_page(head); 1400 goto repeat; 1401 } 1402 1403 pages[ret] = page; 1404 if (++ret == nr_pages) 1405 break; 1406 } 1407 1408 rcu_read_unlock(); 1409 return ret; 1410 } 1411 1412 /** 1413 * find_get_pages_contig - gang contiguous pagecache lookup 1414 * @mapping: The address_space to search 1415 * @index: The starting page index 1416 * @nr_pages: The maximum number of pages 1417 * @pages: Where the resulting pages are placed 1418 * 1419 * find_get_pages_contig() works exactly like find_get_pages(), except 1420 * that the returned number of pages are guaranteed to be contiguous. 1421 * 1422 * find_get_pages_contig() returns the number of pages which were found. 1423 */ 1424 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, 1425 unsigned int nr_pages, struct page **pages) 1426 { 1427 struct radix_tree_iter iter; 1428 void **slot; 1429 unsigned int ret = 0; 1430 1431 if (unlikely(!nr_pages)) 1432 return 0; 1433 1434 rcu_read_lock(); 1435 radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { 1436 struct page *head, *page; 1437 repeat: 1438 page = radix_tree_deref_slot(slot); 1439 /* The hole, there no reason to continue */ 1440 if (unlikely(!page)) 1441 break; 1442 1443 if (radix_tree_exception(page)) { 1444 if (radix_tree_deref_retry(page)) { 1445 slot = radix_tree_iter_retry(&iter); 1446 continue; 1447 } 1448 /* 1449 * A shadow entry of a recently evicted page, 1450 * or a swap entry from shmem/tmpfs. Stop 1451 * looking for contiguous pages. 1452 */ 1453 break; 1454 } 1455 1456 head = compound_head(page); 1457 if (!page_cache_get_speculative(head)) 1458 goto repeat; 1459 1460 /* The page was split under us? */ 1461 if (compound_head(page) != head) { 1462 put_page(head); 1463 goto repeat; 1464 } 1465 1466 /* Has the page moved? */ 1467 if (unlikely(page != *slot)) { 1468 put_page(head); 1469 goto repeat; 1470 } 1471 1472 /* 1473 * must check mapping and index after taking the ref. 1474 * otherwise we can get both false positives and false 1475 * negatives, which is just confusing to the caller. 1476 */ 1477 if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { 1478 put_page(page); 1479 break; 1480 } 1481 1482 pages[ret] = page; 1483 if (++ret == nr_pages) 1484 break; 1485 } 1486 rcu_read_unlock(); 1487 return ret; 1488 } 1489 EXPORT_SYMBOL(find_get_pages_contig); 1490 1491 /** 1492 * find_get_pages_tag - find and return pages that match @tag 1493 * @mapping: the address_space to search 1494 * @index: the starting page index 1495 * @tag: the tag index 1496 * @nr_pages: the maximum number of pages 1497 * @pages: where the resulting pages are placed 1498 * 1499 * Like find_get_pages, except we only return pages which are tagged with 1500 * @tag. We update @index to index the next page for the traversal. 1501 */ 1502 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 1503 int tag, unsigned int nr_pages, struct page **pages) 1504 { 1505 struct radix_tree_iter iter; 1506 void **slot; 1507 unsigned ret = 0; 1508 1509 if (unlikely(!nr_pages)) 1510 return 0; 1511 1512 rcu_read_lock(); 1513 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1514 &iter, *index, tag) { 1515 struct page *head, *page; 1516 repeat: 1517 page = radix_tree_deref_slot(slot); 1518 if (unlikely(!page)) 1519 continue; 1520 1521 if (radix_tree_exception(page)) { 1522 if (radix_tree_deref_retry(page)) { 1523 slot = radix_tree_iter_retry(&iter); 1524 continue; 1525 } 1526 /* 1527 * A shadow entry of a recently evicted page. 1528 * 1529 * Those entries should never be tagged, but 1530 * this tree walk is lockless and the tags are 1531 * looked up in bulk, one radix tree node at a 1532 * time, so there is a sizable window for page 1533 * reclaim to evict a page we saw tagged. 1534 * 1535 * Skip over it. 1536 */ 1537 continue; 1538 } 1539 1540 head = compound_head(page); 1541 if (!page_cache_get_speculative(head)) 1542 goto repeat; 1543 1544 /* The page was split under us? */ 1545 if (compound_head(page) != head) { 1546 put_page(head); 1547 goto repeat; 1548 } 1549 1550 /* Has the page moved? */ 1551 if (unlikely(page != *slot)) { 1552 put_page(head); 1553 goto repeat; 1554 } 1555 1556 pages[ret] = page; 1557 if (++ret == nr_pages) 1558 break; 1559 } 1560 1561 rcu_read_unlock(); 1562 1563 if (ret) 1564 *index = pages[ret - 1]->index + 1; 1565 1566 return ret; 1567 } 1568 EXPORT_SYMBOL(find_get_pages_tag); 1569 1570 /** 1571 * find_get_entries_tag - find and return entries that match @tag 1572 * @mapping: the address_space to search 1573 * @start: the starting page cache index 1574 * @tag: the tag index 1575 * @nr_entries: the maximum number of entries 1576 * @entries: where the resulting entries are placed 1577 * @indices: the cache indices corresponding to the entries in @entries 1578 * 1579 * Like find_get_entries, except we only return entries which are tagged with 1580 * @tag. 1581 */ 1582 unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 1583 int tag, unsigned int nr_entries, 1584 struct page **entries, pgoff_t *indices) 1585 { 1586 void **slot; 1587 unsigned int ret = 0; 1588 struct radix_tree_iter iter; 1589 1590 if (!nr_entries) 1591 return 0; 1592 1593 rcu_read_lock(); 1594 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1595 &iter, start, tag) { 1596 struct page *head, *page; 1597 repeat: 1598 page = radix_tree_deref_slot(slot); 1599 if (unlikely(!page)) 1600 continue; 1601 if (radix_tree_exception(page)) { 1602 if (radix_tree_deref_retry(page)) { 1603 slot = radix_tree_iter_retry(&iter); 1604 continue; 1605 } 1606 1607 /* 1608 * A shadow entry of a recently evicted page, a swap 1609 * entry from shmem/tmpfs or a DAX entry. Return it 1610 * without attempting to raise page count. 1611 */ 1612 goto export; 1613 } 1614 1615 head = compound_head(page); 1616 if (!page_cache_get_speculative(head)) 1617 goto repeat; 1618 1619 /* The page was split under us? */ 1620 if (compound_head(page) != head) { 1621 put_page(head); 1622 goto repeat; 1623 } 1624 1625 /* Has the page moved? */ 1626 if (unlikely(page != *slot)) { 1627 put_page(head); 1628 goto repeat; 1629 } 1630 export: 1631 indices[ret] = iter.index; 1632 entries[ret] = page; 1633 if (++ret == nr_entries) 1634 break; 1635 } 1636 rcu_read_unlock(); 1637 return ret; 1638 } 1639 EXPORT_SYMBOL(find_get_entries_tag); 1640 1641 /* 1642 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 1643 * a _large_ part of the i/o request. Imagine the worst scenario: 1644 * 1645 * ---R__________________________________________B__________ 1646 * ^ reading here ^ bad block(assume 4k) 1647 * 1648 * read(R) => miss => readahead(R...B) => media error => frustrating retries 1649 * => failing the whole request => read(R) => read(R+1) => 1650 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => 1651 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => 1652 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... 1653 * 1654 * It is going insane. Fix it by quickly scaling down the readahead size. 1655 */ 1656 static void shrink_readahead_size_eio(struct file *filp, 1657 struct file_ra_state *ra) 1658 { 1659 ra->ra_pages /= 4; 1660 } 1661 1662 /** 1663 * do_generic_file_read - generic file read routine 1664 * @filp: the file to read 1665 * @ppos: current file position 1666 * @iter: data destination 1667 * @written: already copied 1668 * 1669 * This is a generic file read routine, and uses the 1670 * mapping->a_ops->readpage() function for the actual low-level stuff. 1671 * 1672 * This is really ugly. But the goto's actually try to clarify some 1673 * of the logic when it comes to error handling etc. 1674 */ 1675 static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, 1676 struct iov_iter *iter, ssize_t written) 1677 { 1678 struct address_space *mapping = filp->f_mapping; 1679 struct inode *inode = mapping->host; 1680 struct file_ra_state *ra = &filp->f_ra; 1681 pgoff_t index; 1682 pgoff_t last_index; 1683 pgoff_t prev_index; 1684 unsigned long offset; /* offset into pagecache page */ 1685 unsigned int prev_offset; 1686 int error = 0; 1687 1688 if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) 1689 return -EINVAL; 1690 iov_iter_truncate(iter, inode->i_sb->s_maxbytes); 1691 1692 index = *ppos >> PAGE_SHIFT; 1693 prev_index = ra->prev_pos >> PAGE_SHIFT; 1694 prev_offset = ra->prev_pos & (PAGE_SIZE-1); 1695 last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; 1696 offset = *ppos & ~PAGE_MASK; 1697 1698 for (;;) { 1699 struct page *page; 1700 pgoff_t end_index; 1701 loff_t isize; 1702 unsigned long nr, ret; 1703 1704 cond_resched(); 1705 find_page: 1706 page = find_get_page(mapping, index); 1707 if (!page) { 1708 page_cache_sync_readahead(mapping, 1709 ra, filp, 1710 index, last_index - index); 1711 page = find_get_page(mapping, index); 1712 if (unlikely(page == NULL)) 1713 goto no_cached_page; 1714 } 1715 if (PageReadahead(page)) { 1716 page_cache_async_readahead(mapping, 1717 ra, filp, page, 1718 index, last_index - index); 1719 } 1720 if (!PageUptodate(page)) { 1721 /* 1722 * See comment in do_read_cache_page on why 1723 * wait_on_page_locked is used to avoid unnecessarily 1724 * serialisations and why it's safe. 1725 */ 1726 error = wait_on_page_locked_killable(page); 1727 if (unlikely(error)) 1728 goto readpage_error; 1729 if (PageUptodate(page)) 1730 goto page_ok; 1731 1732 if (inode->i_blkbits == PAGE_SHIFT || 1733 !mapping->a_ops->is_partially_uptodate) 1734 goto page_not_up_to_date; 1735 /* pipes can't handle partially uptodate pages */ 1736 if (unlikely(iter->type & ITER_PIPE)) 1737 goto page_not_up_to_date; 1738 if (!trylock_page(page)) 1739 goto page_not_up_to_date; 1740 /* Did it get truncated before we got the lock? */ 1741 if (!page->mapping) 1742 goto page_not_up_to_date_locked; 1743 if (!mapping->a_ops->is_partially_uptodate(page, 1744 offset, iter->count)) 1745 goto page_not_up_to_date_locked; 1746 unlock_page(page); 1747 } 1748 page_ok: 1749 /* 1750 * i_size must be checked after we know the page is Uptodate. 1751 * 1752 * Checking i_size after the check allows us to calculate 1753 * the correct value for "nr", which means the zero-filled 1754 * part of the page is not copied back to userspace (unless 1755 * another truncate extends the file - this is desired though). 1756 */ 1757 1758 isize = i_size_read(inode); 1759 end_index = (isize - 1) >> PAGE_SHIFT; 1760 if (unlikely(!isize || index > end_index)) { 1761 put_page(page); 1762 goto out; 1763 } 1764 1765 /* nr is the maximum number of bytes to copy from this page */ 1766 nr = PAGE_SIZE; 1767 if (index == end_index) { 1768 nr = ((isize - 1) & ~PAGE_MASK) + 1; 1769 if (nr <= offset) { 1770 put_page(page); 1771 goto out; 1772 } 1773 } 1774 nr = nr - offset; 1775 1776 /* If users can be writing to this page using arbitrary 1777 * virtual addresses, take care about potential aliasing 1778 * before reading the page on the kernel side. 1779 */ 1780 if (mapping_writably_mapped(mapping)) 1781 flush_dcache_page(page); 1782 1783 /* 1784 * When a sequential read accesses a page several times, 1785 * only mark it as accessed the first time. 1786 */ 1787 if (prev_index != index || offset != prev_offset) 1788 mark_page_accessed(page); 1789 prev_index = index; 1790 1791 /* 1792 * Ok, we have the page, and it's up-to-date, so 1793 * now we can copy it to user space... 1794 */ 1795 1796 ret = copy_page_to_iter(page, offset, nr, iter); 1797 offset += ret; 1798 index += offset >> PAGE_SHIFT; 1799 offset &= ~PAGE_MASK; 1800 prev_offset = offset; 1801 1802 put_page(page); 1803 written += ret; 1804 if (!iov_iter_count(iter)) 1805 goto out; 1806 if (ret < nr) { 1807 error = -EFAULT; 1808 goto out; 1809 } 1810 continue; 1811 1812 page_not_up_to_date: 1813 /* Get exclusive access to the page ... */ 1814 error = lock_page_killable(page); 1815 if (unlikely(error)) 1816 goto readpage_error; 1817 1818 page_not_up_to_date_locked: 1819 /* Did it get truncated before we got the lock? */ 1820 if (!page->mapping) { 1821 unlock_page(page); 1822 put_page(page); 1823 continue; 1824 } 1825 1826 /* Did somebody else fill it already? */ 1827 if (PageUptodate(page)) { 1828 unlock_page(page); 1829 goto page_ok; 1830 } 1831 1832 readpage: 1833 /* 1834 * A previous I/O error may have been due to temporary 1835 * failures, eg. multipath errors. 1836 * PG_error will be set again if readpage fails. 1837 */ 1838 ClearPageError(page); 1839 /* Start the actual read. The read will unlock the page. */ 1840 error = mapping->a_ops->readpage(filp, page); 1841 1842 if (unlikely(error)) { 1843 if (error == AOP_TRUNCATED_PAGE) { 1844 put_page(page); 1845 error = 0; 1846 goto find_page; 1847 } 1848 goto readpage_error; 1849 } 1850 1851 if (!PageUptodate(page)) { 1852 error = lock_page_killable(page); 1853 if (unlikely(error)) 1854 goto readpage_error; 1855 if (!PageUptodate(page)) { 1856 if (page->mapping == NULL) { 1857 /* 1858 * invalidate_mapping_pages got it 1859 */ 1860 unlock_page(page); 1861 put_page(page); 1862 goto find_page; 1863 } 1864 unlock_page(page); 1865 shrink_readahead_size_eio(filp, ra); 1866 error = -EIO; 1867 goto readpage_error; 1868 } 1869 unlock_page(page); 1870 } 1871 1872 goto page_ok; 1873 1874 readpage_error: 1875 /* UHHUH! A synchronous read error occurred. Report it */ 1876 put_page(page); 1877 goto out; 1878 1879 no_cached_page: 1880 /* 1881 * Ok, it wasn't cached, so we need to create a new 1882 * page.. 1883 */ 1884 page = page_cache_alloc_cold(mapping); 1885 if (!page) { 1886 error = -ENOMEM; 1887 goto out; 1888 } 1889 error = add_to_page_cache_lru(page, mapping, index, 1890 mapping_gfp_constraint(mapping, GFP_KERNEL)); 1891 if (error) { 1892 put_page(page); 1893 if (error == -EEXIST) { 1894 error = 0; 1895 goto find_page; 1896 } 1897 goto out; 1898 } 1899 goto readpage; 1900 } 1901 1902 out: 1903 ra->prev_pos = prev_index; 1904 ra->prev_pos <<= PAGE_SHIFT; 1905 ra->prev_pos |= prev_offset; 1906 1907 *ppos = ((loff_t)index << PAGE_SHIFT) + offset; 1908 file_accessed(filp); 1909 return written ? written : error; 1910 } 1911 1912 /** 1913 * generic_file_read_iter - generic filesystem read routine 1914 * @iocb: kernel I/O control block 1915 * @iter: destination for the data read 1916 * 1917 * This is the "read_iter()" routine for all filesystems 1918 * that can use the page cache directly. 1919 */ 1920 ssize_t 1921 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) 1922 { 1923 struct file *file = iocb->ki_filp; 1924 ssize_t retval = 0; 1925 size_t count = iov_iter_count(iter); 1926 1927 if (!count) 1928 goto out; /* skip atime */ 1929 1930 if (iocb->ki_flags & IOCB_DIRECT) { 1931 struct address_space *mapping = file->f_mapping; 1932 struct inode *inode = mapping->host; 1933 struct iov_iter data = *iter; 1934 loff_t size; 1935 1936 size = i_size_read(inode); 1937 retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, 1938 iocb->ki_pos + count - 1); 1939 if (retval < 0) 1940 goto out; 1941 1942 file_accessed(file); 1943 1944 retval = mapping->a_ops->direct_IO(iocb, &data); 1945 if (retval >= 0) { 1946 iocb->ki_pos += retval; 1947 iov_iter_advance(iter, retval); 1948 } 1949 1950 /* 1951 * Btrfs can have a short DIO read if we encounter 1952 * compressed extents, so if there was an error, or if 1953 * we've already read everything we wanted to, or if 1954 * there was a short read because we hit EOF, go ahead 1955 * and return. Otherwise fallthrough to buffered io for 1956 * the rest of the read. Buffered reads will not work for 1957 * DAX files, so don't bother trying. 1958 */ 1959 if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size || 1960 IS_DAX(inode)) 1961 goto out; 1962 } 1963 1964 retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval); 1965 out: 1966 return retval; 1967 } 1968 EXPORT_SYMBOL(generic_file_read_iter); 1969 1970 #ifdef CONFIG_MMU 1971 /** 1972 * page_cache_read - adds requested page to the page cache if not already there 1973 * @file: file to read 1974 * @offset: page index 1975 * @gfp_mask: memory allocation flags 1976 * 1977 * This adds the requested page to the page cache if it isn't already there, 1978 * and schedules an I/O to read in its contents from disk. 1979 */ 1980 static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) 1981 { 1982 struct address_space *mapping = file->f_mapping; 1983 struct page *page; 1984 int ret; 1985 1986 do { 1987 page = __page_cache_alloc(gfp_mask|__GFP_COLD); 1988 if (!page) 1989 return -ENOMEM; 1990 1991 ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL); 1992 if (ret == 0) 1993 ret = mapping->a_ops->readpage(file, page); 1994 else if (ret == -EEXIST) 1995 ret = 0; /* losing race to add is OK */ 1996 1997 put_page(page); 1998 1999 } while (ret == AOP_TRUNCATED_PAGE); 2000 2001 return ret; 2002 } 2003 2004 #define MMAP_LOTSAMISS (100) 2005 2006 /* 2007 * Synchronous readahead happens when we don't even find 2008 * a page in the page cache at all. 2009 */ 2010 static void do_sync_mmap_readahead(struct vm_area_struct *vma, 2011 struct file_ra_state *ra, 2012 struct file *file, 2013 pgoff_t offset) 2014 { 2015 struct address_space *mapping = file->f_mapping; 2016 2017 /* If we don't want any read-ahead, don't bother */ 2018 if (vma->vm_flags & VM_RAND_READ) 2019 return; 2020 if (!ra->ra_pages) 2021 return; 2022 2023 if (vma->vm_flags & VM_SEQ_READ) { 2024 page_cache_sync_readahead(mapping, ra, file, offset, 2025 ra->ra_pages); 2026 return; 2027 } 2028 2029 /* Avoid banging the cache line if not needed */ 2030 if (ra->mmap_miss < MMAP_LOTSAMISS * 10) 2031 ra->mmap_miss++; 2032 2033 /* 2034 * Do we miss much more than hit in this file? If so, 2035 * stop bothering with read-ahead. It will only hurt. 2036 */ 2037 if (ra->mmap_miss > MMAP_LOTSAMISS) 2038 return; 2039 2040 /* 2041 * mmap read-around 2042 */ 2043 ra->start = max_t(long, 0, offset - ra->ra_pages / 2); 2044 ra->size = ra->ra_pages; 2045 ra->async_size = ra->ra_pages / 4; 2046 ra_submit(ra, mapping, file); 2047 } 2048 2049 /* 2050 * Asynchronous readahead happens when we find the page and PG_readahead, 2051 * so we want to possibly extend the readahead further.. 2052 */ 2053 static void do_async_mmap_readahead(struct vm_area_struct *vma, 2054 struct file_ra_state *ra, 2055 struct file *file, 2056 struct page *page, 2057 pgoff_t offset) 2058 { 2059 struct address_space *mapping = file->f_mapping; 2060 2061 /* If we don't want any read-ahead, don't bother */ 2062 if (vma->vm_flags & VM_RAND_READ) 2063 return; 2064 if (ra->mmap_miss > 0) 2065 ra->mmap_miss--; 2066 if (PageReadahead(page)) 2067 page_cache_async_readahead(mapping, ra, file, 2068 page, offset, ra->ra_pages); 2069 } 2070 2071 /** 2072 * filemap_fault - read in file data for page fault handling 2073 * @vma: vma in which the fault was taken 2074 * @vmf: struct vm_fault containing details of the fault 2075 * 2076 * filemap_fault() is invoked via the vma operations vector for a 2077 * mapped memory region to read in file data during a page fault. 2078 * 2079 * The goto's are kind of ugly, but this streamlines the normal case of having 2080 * it in the page cache, and handles the special cases reasonably without 2081 * having a lot of duplicated code. 2082 * 2083 * vma->vm_mm->mmap_sem must be held on entry. 2084 * 2085 * If our return value has VM_FAULT_RETRY set, it's because 2086 * lock_page_or_retry() returned 0. 2087 * The mmap_sem has usually been released in this case. 2088 * See __lock_page_or_retry() for the exception. 2089 * 2090 * If our return value does not have VM_FAULT_RETRY set, the mmap_sem 2091 * has not been released. 2092 * 2093 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. 2094 */ 2095 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2096 { 2097 int error; 2098 struct file *file = vma->vm_file; 2099 struct address_space *mapping = file->f_mapping; 2100 struct file_ra_state *ra = &file->f_ra; 2101 struct inode *inode = mapping->host; 2102 pgoff_t offset = vmf->pgoff; 2103 struct page *page; 2104 loff_t size; 2105 int ret = 0; 2106 2107 size = round_up(i_size_read(inode), PAGE_SIZE); 2108 if (offset >= size >> PAGE_SHIFT) 2109 return VM_FAULT_SIGBUS; 2110 2111 /* 2112 * Do we have something in the page cache already? 2113 */ 2114 page = find_get_page(mapping, offset); 2115 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { 2116 /* 2117 * We found the page, so try async readahead before 2118 * waiting for the lock. 2119 */ 2120 do_async_mmap_readahead(vma, ra, file, page, offset); 2121 } else if (!page) { 2122 /* No page in the page cache at all */ 2123 do_sync_mmap_readahead(vma, ra, file, offset); 2124 count_vm_event(PGMAJFAULT); 2125 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 2126 ret = VM_FAULT_MAJOR; 2127 retry_find: 2128 page = find_get_page(mapping, offset); 2129 if (!page) 2130 goto no_cached_page; 2131 } 2132 2133 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { 2134 put_page(page); 2135 return ret | VM_FAULT_RETRY; 2136 } 2137 2138 /* Did it get truncated? */ 2139 if (unlikely(page->mapping != mapping)) { 2140 unlock_page(page); 2141 put_page(page); 2142 goto retry_find; 2143 } 2144 VM_BUG_ON_PAGE(page->index != offset, page); 2145 2146 /* 2147 * We have a locked page in the page cache, now we need to check 2148 * that it's up-to-date. If not, it is going to be due to an error. 2149 */ 2150 if (unlikely(!PageUptodate(page))) 2151 goto page_not_uptodate; 2152 2153 /* 2154 * Found the page and have a reference on it. 2155 * We must recheck i_size under page lock. 2156 */ 2157 size = round_up(i_size_read(inode), PAGE_SIZE); 2158 if (unlikely(offset >= size >> PAGE_SHIFT)) { 2159 unlock_page(page); 2160 put_page(page); 2161 return VM_FAULT_SIGBUS; 2162 } 2163 2164 vmf->page = page; 2165 return ret | VM_FAULT_LOCKED; 2166 2167 no_cached_page: 2168 /* 2169 * We're only likely to ever get here if MADV_RANDOM is in 2170 * effect. 2171 */ 2172 error = page_cache_read(file, offset, vmf->gfp_mask); 2173 2174 /* 2175 * The page we want has now been added to the page cache. 2176 * In the unlikely event that someone removed it in the 2177 * meantime, we'll just come back here and read it again. 2178 */ 2179 if (error >= 0) 2180 goto retry_find; 2181 2182 /* 2183 * An error return from page_cache_read can result if the 2184 * system is low on memory, or a problem occurs while trying 2185 * to schedule I/O. 2186 */ 2187 if (error == -ENOMEM) 2188 return VM_FAULT_OOM; 2189 return VM_FAULT_SIGBUS; 2190 2191 page_not_uptodate: 2192 /* 2193 * Umm, take care of errors if the page isn't up-to-date. 2194 * Try to re-read it _once_. We do this synchronously, 2195 * because there really aren't any performance issues here 2196 * and we need to check for errors. 2197 */ 2198 ClearPageError(page); 2199 error = mapping->a_ops->readpage(file, page); 2200 if (!error) { 2201 wait_on_page_locked(page); 2202 if (!PageUptodate(page)) 2203 error = -EIO; 2204 } 2205 put_page(page); 2206 2207 if (!error || error == AOP_TRUNCATED_PAGE) 2208 goto retry_find; 2209 2210 /* Things didn't work out. Return zero to tell the mm layer so. */ 2211 shrink_readahead_size_eio(file, ra); 2212 return VM_FAULT_SIGBUS; 2213 } 2214 EXPORT_SYMBOL(filemap_fault); 2215 2216 void filemap_map_pages(struct fault_env *fe, 2217 pgoff_t start_pgoff, pgoff_t end_pgoff) 2218 { 2219 struct radix_tree_iter iter; 2220 void **slot; 2221 struct file *file = fe->vma->vm_file; 2222 struct address_space *mapping = file->f_mapping; 2223 pgoff_t last_pgoff = start_pgoff; 2224 loff_t size; 2225 struct page *head, *page; 2226 2227 rcu_read_lock(); 2228 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, 2229 start_pgoff) { 2230 if (iter.index > end_pgoff) 2231 break; 2232 repeat: 2233 page = radix_tree_deref_slot(slot); 2234 if (unlikely(!page)) 2235 goto next; 2236 if (radix_tree_exception(page)) { 2237 if (radix_tree_deref_retry(page)) { 2238 slot = radix_tree_iter_retry(&iter); 2239 continue; 2240 } 2241 goto next; 2242 } 2243 2244 head = compound_head(page); 2245 if (!page_cache_get_speculative(head)) 2246 goto repeat; 2247 2248 /* The page was split under us? */ 2249 if (compound_head(page) != head) { 2250 put_page(head); 2251 goto repeat; 2252 } 2253 2254 /* Has the page moved? */ 2255 if (unlikely(page != *slot)) { 2256 put_page(head); 2257 goto repeat; 2258 } 2259 2260 if (!PageUptodate(page) || 2261 PageReadahead(page) || 2262 PageHWPoison(page)) 2263 goto skip; 2264 if (!trylock_page(page)) 2265 goto skip; 2266 2267 if (page->mapping != mapping || !PageUptodate(page)) 2268 goto unlock; 2269 2270 size = round_up(i_size_read(mapping->host), PAGE_SIZE); 2271 if (page->index >= size >> PAGE_SHIFT) 2272 goto unlock; 2273 2274 if (file->f_ra.mmap_miss > 0) 2275 file->f_ra.mmap_miss--; 2276 2277 fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; 2278 if (fe->pte) 2279 fe->pte += iter.index - last_pgoff; 2280 last_pgoff = iter.index; 2281 if (alloc_set_pte(fe, NULL, page)) 2282 goto unlock; 2283 unlock_page(page); 2284 goto next; 2285 unlock: 2286 unlock_page(page); 2287 skip: 2288 put_page(page); 2289 next: 2290 /* Huge page is mapped? No need to proceed. */ 2291 if (pmd_trans_huge(*fe->pmd)) 2292 break; 2293 if (iter.index == end_pgoff) 2294 break; 2295 } 2296 rcu_read_unlock(); 2297 } 2298 EXPORT_SYMBOL(filemap_map_pages); 2299 2300 int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 2301 { 2302 struct page *page = vmf->page; 2303 struct inode *inode = file_inode(vma->vm_file); 2304 int ret = VM_FAULT_LOCKED; 2305 2306 sb_start_pagefault(inode->i_sb); 2307 file_update_time(vma->vm_file); 2308 lock_page(page); 2309 if (page->mapping != inode->i_mapping) { 2310 unlock_page(page); 2311 ret = VM_FAULT_NOPAGE; 2312 goto out; 2313 } 2314 /* 2315 * We mark the page dirty already here so that when freeze is in 2316 * progress, we are guaranteed that writeback during freezing will 2317 * see the dirty page and writeprotect it again. 2318 */ 2319 set_page_dirty(page); 2320 wait_for_stable_page(page); 2321 out: 2322 sb_end_pagefault(inode->i_sb); 2323 return ret; 2324 } 2325 EXPORT_SYMBOL(filemap_page_mkwrite); 2326 2327 const struct vm_operations_struct generic_file_vm_ops = { 2328 .fault = filemap_fault, 2329 .map_pages = filemap_map_pages, 2330 .page_mkwrite = filemap_page_mkwrite, 2331 }; 2332 2333 /* This is used for a general mmap of a disk file */ 2334 2335 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 2336 { 2337 struct address_space *mapping = file->f_mapping; 2338 2339 if (!mapping->a_ops->readpage) 2340 return -ENOEXEC; 2341 file_accessed(file); 2342 vma->vm_ops = &generic_file_vm_ops; 2343 return 0; 2344 } 2345 2346 /* 2347 * This is for filesystems which do not implement ->writepage. 2348 */ 2349 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) 2350 { 2351 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 2352 return -EINVAL; 2353 return generic_file_mmap(file, vma); 2354 } 2355 #else 2356 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 2357 { 2358 return -ENOSYS; 2359 } 2360 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) 2361 { 2362 return -ENOSYS; 2363 } 2364 #endif /* CONFIG_MMU */ 2365 2366 EXPORT_SYMBOL(generic_file_mmap); 2367 EXPORT_SYMBOL(generic_file_readonly_mmap); 2368 2369 static struct page *wait_on_page_read(struct page *page) 2370 { 2371 if (!IS_ERR(page)) { 2372 wait_on_page_locked(page); 2373 if (!PageUptodate(page)) { 2374 put_page(page); 2375 page = ERR_PTR(-EIO); 2376 } 2377 } 2378 return page; 2379 } 2380 2381 static struct page *do_read_cache_page(struct address_space *mapping, 2382 pgoff_t index, 2383 int (*filler)(void *, struct page *), 2384 void *data, 2385 gfp_t gfp) 2386 { 2387 struct page *page; 2388 int err; 2389 repeat: 2390 page = find_get_page(mapping, index); 2391 if (!page) { 2392 page = __page_cache_alloc(gfp | __GFP_COLD); 2393 if (!page) 2394 return ERR_PTR(-ENOMEM); 2395 err = add_to_page_cache_lru(page, mapping, index, gfp); 2396 if (unlikely(err)) { 2397 put_page(page); 2398 if (err == -EEXIST) 2399 goto repeat; 2400 /* Presumably ENOMEM for radix tree node */ 2401 return ERR_PTR(err); 2402 } 2403 2404 filler: 2405 err = filler(data, page); 2406 if (err < 0) { 2407 put_page(page); 2408 return ERR_PTR(err); 2409 } 2410 2411 page = wait_on_page_read(page); 2412 if (IS_ERR(page)) 2413 return page; 2414 goto out; 2415 } 2416 if (PageUptodate(page)) 2417 goto out; 2418 2419 /* 2420 * Page is not up to date and may be locked due one of the following 2421 * case a: Page is being filled and the page lock is held 2422 * case b: Read/write error clearing the page uptodate status 2423 * case c: Truncation in progress (page locked) 2424 * case d: Reclaim in progress 2425 * 2426 * Case a, the page will be up to date when the page is unlocked. 2427 * There is no need to serialise on the page lock here as the page 2428 * is pinned so the lock gives no additional protection. Even if the 2429 * the page is truncated, the data is still valid if PageUptodate as 2430 * it's a race vs truncate race. 2431 * Case b, the page will not be up to date 2432 * Case c, the page may be truncated but in itself, the data may still 2433 * be valid after IO completes as it's a read vs truncate race. The 2434 * operation must restart if the page is not uptodate on unlock but 2435 * otherwise serialising on page lock to stabilise the mapping gives 2436 * no additional guarantees to the caller as the page lock is 2437 * released before return. 2438 * Case d, similar to truncation. If reclaim holds the page lock, it 2439 * will be a race with remove_mapping that determines if the mapping 2440 * is valid on unlock but otherwise the data is valid and there is 2441 * no need to serialise with page lock. 2442 * 2443 * As the page lock gives no additional guarantee, we optimistically 2444 * wait on the page to be unlocked and check if it's up to date and 2445 * use the page if it is. Otherwise, the page lock is required to 2446 * distinguish between the different cases. The motivation is that we 2447 * avoid spurious serialisations and wakeups when multiple processes 2448 * wait on the same page for IO to complete. 2449 */ 2450 wait_on_page_locked(page); 2451 if (PageUptodate(page)) 2452 goto out; 2453 2454 /* Distinguish between all the cases under the safety of the lock */ 2455 lock_page(page); 2456 2457 /* Case c or d, restart the operation */ 2458 if (!page->mapping) { 2459 unlock_page(page); 2460 put_page(page); 2461 goto repeat; 2462 } 2463 2464 /* Someone else locked and filled the page in a very small window */ 2465 if (PageUptodate(page)) { 2466 unlock_page(page); 2467 goto out; 2468 } 2469 goto filler; 2470 2471 out: 2472 mark_page_accessed(page); 2473 return page; 2474 } 2475 2476 /** 2477 * read_cache_page - read into page cache, fill it if needed 2478 * @mapping: the page's address_space 2479 * @index: the page index 2480 * @filler: function to perform the read 2481 * @data: first arg to filler(data, page) function, often left as NULL 2482 * 2483 * Read into the page cache. If a page already exists, and PageUptodate() is 2484 * not set, try to fill the page and wait for it to become unlocked. 2485 * 2486 * If the page does not get brought uptodate, return -EIO. 2487 */ 2488 struct page *read_cache_page(struct address_space *mapping, 2489 pgoff_t index, 2490 int (*filler)(void *, struct page *), 2491 void *data) 2492 { 2493 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); 2494 } 2495 EXPORT_SYMBOL(read_cache_page); 2496 2497 /** 2498 * read_cache_page_gfp - read into page cache, using specified page allocation flags. 2499 * @mapping: the page's address_space 2500 * @index: the page index 2501 * @gfp: the page allocator flags to use if allocating 2502 * 2503 * This is the same as "read_mapping_page(mapping, index, NULL)", but with 2504 * any new page allocations done using the specified allocation flags. 2505 * 2506 * If the page does not get brought uptodate, return -EIO. 2507 */ 2508 struct page *read_cache_page_gfp(struct address_space *mapping, 2509 pgoff_t index, 2510 gfp_t gfp) 2511 { 2512 filler_t *filler = (filler_t *)mapping->a_ops->readpage; 2513 2514 return do_read_cache_page(mapping, index, filler, NULL, gfp); 2515 } 2516 EXPORT_SYMBOL(read_cache_page_gfp); 2517 2518 /* 2519 * Performs necessary checks before doing a write 2520 * 2521 * Can adjust writing position or amount of bytes to write. 2522 * Returns appropriate error code that caller should return or 2523 * zero in case that write should be allowed. 2524 */ 2525 inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) 2526 { 2527 struct file *file = iocb->ki_filp; 2528 struct inode *inode = file->f_mapping->host; 2529 unsigned long limit = rlimit(RLIMIT_FSIZE); 2530 loff_t pos; 2531 2532 if (!iov_iter_count(from)) 2533 return 0; 2534 2535 /* FIXME: this is for backwards compatibility with 2.4 */ 2536 if (iocb->ki_flags & IOCB_APPEND) 2537 iocb->ki_pos = i_size_read(inode); 2538 2539 pos = iocb->ki_pos; 2540 2541 if (limit != RLIM_INFINITY) { 2542 if (iocb->ki_pos >= limit) { 2543 send_sig(SIGXFSZ, current, 0); 2544 return -EFBIG; 2545 } 2546 iov_iter_truncate(from, limit - (unsigned long)pos); 2547 } 2548 2549 /* 2550 * LFS rule 2551 */ 2552 if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && 2553 !(file->f_flags & O_LARGEFILE))) { 2554 if (pos >= MAX_NON_LFS) 2555 return -EFBIG; 2556 iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); 2557 } 2558 2559 /* 2560 * Are we about to exceed the fs block limit ? 2561 * 2562 * If we have written data it becomes a short write. If we have 2563 * exceeded without writing data we send a signal and return EFBIG. 2564 * Linus frestrict idea will clean these up nicely.. 2565 */ 2566 if (unlikely(pos >= inode->i_sb->s_maxbytes)) 2567 return -EFBIG; 2568 2569 iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); 2570 return iov_iter_count(from); 2571 } 2572 EXPORT_SYMBOL(generic_write_checks); 2573 2574 int pagecache_write_begin(struct file *file, struct address_space *mapping, 2575 loff_t pos, unsigned len, unsigned flags, 2576 struct page **pagep, void **fsdata) 2577 { 2578 const struct address_space_operations *aops = mapping->a_ops; 2579 2580 return aops->write_begin(file, mapping, pos, len, flags, 2581 pagep, fsdata); 2582 } 2583 EXPORT_SYMBOL(pagecache_write_begin); 2584 2585 int pagecache_write_end(struct file *file, struct address_space *mapping, 2586 loff_t pos, unsigned len, unsigned copied, 2587 struct page *page, void *fsdata) 2588 { 2589 const struct address_space_operations *aops = mapping->a_ops; 2590 2591 return aops->write_end(file, mapping, pos, len, copied, page, fsdata); 2592 } 2593 EXPORT_SYMBOL(pagecache_write_end); 2594 2595 ssize_t 2596 generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) 2597 { 2598 struct file *file = iocb->ki_filp; 2599 struct address_space *mapping = file->f_mapping; 2600 struct inode *inode = mapping->host; 2601 loff_t pos = iocb->ki_pos; 2602 ssize_t written; 2603 size_t write_len; 2604 pgoff_t end; 2605 struct iov_iter data; 2606 2607 write_len = iov_iter_count(from); 2608 end = (pos + write_len - 1) >> PAGE_SHIFT; 2609 2610 written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); 2611 if (written) 2612 goto out; 2613 2614 /* 2615 * After a write we want buffered reads to be sure to go to disk to get 2616 * the new data. We invalidate clean cached page from the region we're 2617 * about to write. We do this *before* the write so that we can return 2618 * without clobbering -EIOCBQUEUED from ->direct_IO(). 2619 */ 2620 if (mapping->nrpages) { 2621 written = invalidate_inode_pages2_range(mapping, 2622 pos >> PAGE_SHIFT, end); 2623 /* 2624 * If a page can not be invalidated, return 0 to fall back 2625 * to buffered write. 2626 */ 2627 if (written) { 2628 if (written == -EBUSY) 2629 return 0; 2630 goto out; 2631 } 2632 } 2633 2634 data = *from; 2635 written = mapping->a_ops->direct_IO(iocb, &data); 2636 2637 /* 2638 * Finally, try again to invalidate clean pages which might have been 2639 * cached by non-direct readahead, or faulted in by get_user_pages() 2640 * if the source of the write was an mmap'ed region of the file 2641 * we're writing. Either one is a pretty crazy thing to do, 2642 * so we don't support it 100%. If this invalidation 2643 * fails, tough, the write still worked... 2644 */ 2645 if (mapping->nrpages) { 2646 invalidate_inode_pages2_range(mapping, 2647 pos >> PAGE_SHIFT, end); 2648 } 2649 2650 if (written > 0) { 2651 pos += written; 2652 iov_iter_advance(from, written); 2653 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2654 i_size_write(inode, pos); 2655 mark_inode_dirty(inode); 2656 } 2657 iocb->ki_pos = pos; 2658 } 2659 out: 2660 return written; 2661 } 2662 EXPORT_SYMBOL(generic_file_direct_write); 2663 2664 /* 2665 * Find or create a page at the given pagecache position. Return the locked 2666 * page. This function is specifically for buffered writes. 2667 */ 2668 struct page *grab_cache_page_write_begin(struct address_space *mapping, 2669 pgoff_t index, unsigned flags) 2670 { 2671 struct page *page; 2672 int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT; 2673 2674 if (flags & AOP_FLAG_NOFS) 2675 fgp_flags |= FGP_NOFS; 2676 2677 page = pagecache_get_page(mapping, index, fgp_flags, 2678 mapping_gfp_mask(mapping)); 2679 if (page) 2680 wait_for_stable_page(page); 2681 2682 return page; 2683 } 2684 EXPORT_SYMBOL(grab_cache_page_write_begin); 2685 2686 ssize_t generic_perform_write(struct file *file, 2687 struct iov_iter *i, loff_t pos) 2688 { 2689 struct address_space *mapping = file->f_mapping; 2690 const struct address_space_operations *a_ops = mapping->a_ops; 2691 long status = 0; 2692 ssize_t written = 0; 2693 unsigned int flags = 0; 2694 2695 /* 2696 * Copies from kernel address space cannot fail (NFSD is a big user). 2697 */ 2698 if (!iter_is_iovec(i)) 2699 flags |= AOP_FLAG_UNINTERRUPTIBLE; 2700 2701 do { 2702 struct page *page; 2703 unsigned long offset; /* Offset into pagecache page */ 2704 unsigned long bytes; /* Bytes to write to page */ 2705 size_t copied; /* Bytes copied from user */ 2706 void *fsdata; 2707 2708 offset = (pos & (PAGE_SIZE - 1)); 2709 bytes = min_t(unsigned long, PAGE_SIZE - offset, 2710 iov_iter_count(i)); 2711 2712 again: 2713 /* 2714 * Bring in the user page that we will copy from _first_. 2715 * Otherwise there's a nasty deadlock on copying from the 2716 * same page as we're writing to, without it being marked 2717 * up-to-date. 2718 * 2719 * Not only is this an optimisation, but it is also required 2720 * to check that the address is actually valid, when atomic 2721 * usercopies are used, below. 2722 */ 2723 if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 2724 status = -EFAULT; 2725 break; 2726 } 2727 2728 if (fatal_signal_pending(current)) { 2729 status = -EINTR; 2730 break; 2731 } 2732 2733 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2734 &page, &fsdata); 2735 if (unlikely(status < 0)) 2736 break; 2737 2738 if (mapping_writably_mapped(mapping)) 2739 flush_dcache_page(page); 2740 2741 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2742 flush_dcache_page(page); 2743 2744 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2745 page, fsdata); 2746 if (unlikely(status < 0)) 2747 break; 2748 copied = status; 2749 2750 cond_resched(); 2751 2752 iov_iter_advance(i, copied); 2753 if (unlikely(copied == 0)) { 2754 /* 2755 * If we were unable to copy any data at all, we must 2756 * fall back to a single segment length write. 2757 * 2758 * If we didn't fallback here, we could livelock 2759 * because not all segments in the iov can be copied at 2760 * once without a pagefault. 2761 */ 2762 bytes = min_t(unsigned long, PAGE_SIZE - offset, 2763 iov_iter_single_seg_count(i)); 2764 goto again; 2765 } 2766 pos += copied; 2767 written += copied; 2768 2769 balance_dirty_pages_ratelimited(mapping); 2770 } while (iov_iter_count(i)); 2771 2772 return written ? written : status; 2773 } 2774 EXPORT_SYMBOL(generic_perform_write); 2775 2776 /** 2777 * __generic_file_write_iter - write data to a file 2778 * @iocb: IO state structure (file, offset, etc.) 2779 * @from: iov_iter with data to write 2780 * 2781 * This function does all the work needed for actually writing data to a 2782 * file. It does all basic checks, removes SUID from the file, updates 2783 * modification times and calls proper subroutines depending on whether we 2784 * do direct IO or a standard buffered write. 2785 * 2786 * It expects i_mutex to be grabbed unless we work on a block device or similar 2787 * object which does not need locking at all. 2788 * 2789 * This function does *not* take care of syncing data in case of O_SYNC write. 2790 * A caller has to handle it. This is mainly due to the fact that we want to 2791 * avoid syncing under i_mutex. 2792 */ 2793 ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 2794 { 2795 struct file *file = iocb->ki_filp; 2796 struct address_space * mapping = file->f_mapping; 2797 struct inode *inode = mapping->host; 2798 ssize_t written = 0; 2799 ssize_t err; 2800 ssize_t status; 2801 2802 /* We can write back this queue in page reclaim */ 2803 current->backing_dev_info = inode_to_bdi(inode); 2804 err = file_remove_privs(file); 2805 if (err) 2806 goto out; 2807 2808 err = file_update_time(file); 2809 if (err) 2810 goto out; 2811 2812 if (iocb->ki_flags & IOCB_DIRECT) { 2813 loff_t pos, endbyte; 2814 2815 written = generic_file_direct_write(iocb, from); 2816 /* 2817 * If the write stopped short of completing, fall back to 2818 * buffered writes. Some filesystems do this for writes to 2819 * holes, for example. For DAX files, a buffered write will 2820 * not succeed (even if it did, DAX does not handle dirty 2821 * page-cache pages correctly). 2822 */ 2823 if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) 2824 goto out; 2825 2826 status = generic_perform_write(file, from, pos = iocb->ki_pos); 2827 /* 2828 * If generic_perform_write() returned a synchronous error 2829 * then we want to return the number of bytes which were 2830 * direct-written, or the error code if that was zero. Note 2831 * that this differs from normal direct-io semantics, which 2832 * will return -EFOO even if some bytes were written. 2833 */ 2834 if (unlikely(status < 0)) { 2835 err = status; 2836 goto out; 2837 } 2838 /* 2839 * We need to ensure that the page cache pages are written to 2840 * disk and invalidated to preserve the expected O_DIRECT 2841 * semantics. 2842 */ 2843 endbyte = pos + status - 1; 2844 err = filemap_write_and_wait_range(mapping, pos, endbyte); 2845 if (err == 0) { 2846 iocb->ki_pos = endbyte + 1; 2847 written += status; 2848 invalidate_mapping_pages(mapping, 2849 pos >> PAGE_SHIFT, 2850 endbyte >> PAGE_SHIFT); 2851 } else { 2852 /* 2853 * We don't know how much we wrote, so just return 2854 * the number of bytes which were direct-written 2855 */ 2856 } 2857 } else { 2858 written = generic_perform_write(file, from, iocb->ki_pos); 2859 if (likely(written > 0)) 2860 iocb->ki_pos += written; 2861 } 2862 out: 2863 current->backing_dev_info = NULL; 2864 return written ? written : err; 2865 } 2866 EXPORT_SYMBOL(__generic_file_write_iter); 2867 2868 /** 2869 * generic_file_write_iter - write data to a file 2870 * @iocb: IO state structure 2871 * @from: iov_iter with data to write 2872 * 2873 * This is a wrapper around __generic_file_write_iter() to be used by most 2874 * filesystems. It takes care of syncing the file in case of O_SYNC file 2875 * and acquires i_mutex as needed. 2876 */ 2877 ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 2878 { 2879 struct file *file = iocb->ki_filp; 2880 struct inode *inode = file->f_mapping->host; 2881 ssize_t ret; 2882 2883 inode_lock(inode); 2884 ret = generic_write_checks(iocb, from); 2885 if (ret > 0) 2886 ret = __generic_file_write_iter(iocb, from); 2887 inode_unlock(inode); 2888 2889 if (ret > 0) 2890 ret = generic_write_sync(iocb, ret); 2891 return ret; 2892 } 2893 EXPORT_SYMBOL(generic_file_write_iter); 2894 2895 /** 2896 * try_to_release_page() - release old fs-specific metadata on a page 2897 * 2898 * @page: the page which the kernel is trying to free 2899 * @gfp_mask: memory allocation flags (and I/O mode) 2900 * 2901 * The address_space is to try to release any data against the page 2902 * (presumably at page->private). If the release was successful, return `1'. 2903 * Otherwise return zero. 2904 * 2905 * This may also be called if PG_fscache is set on a page, indicating that the 2906 * page is known to the local caching routines. 2907 * 2908 * The @gfp_mask argument specifies whether I/O may be performed to release 2909 * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). 2910 * 2911 */ 2912 int try_to_release_page(struct page *page, gfp_t gfp_mask) 2913 { 2914 struct address_space * const mapping = page->mapping; 2915 2916 BUG_ON(!PageLocked(page)); 2917 if (PageWriteback(page)) 2918 return 0; 2919 2920 if (mapping && mapping->a_ops->releasepage) 2921 return mapping->a_ops->releasepage(page, gfp_mask); 2922 return try_to_free_buffers(page); 2923 } 2924 2925 EXPORT_SYMBOL(try_to_release_page); 2926