1 /* 2 * linux/mm/filemap.c 3 * 4 * Copyright (C) 1994-1999 Linus Torvalds 5 */ 6 7 /* 8 * This file handles the generic file mmap semantics used by 9 * most "normal" filesystems (but you don't /have/ to use this: 10 * the NFS filesystem used to do this differently, for example) 11 */ 12 #include <linux/export.h> 13 #include <linux/compiler.h> 14 #include <linux/dax.h> 15 #include <linux/fs.h> 16 #include <linux/sched/signal.h> 17 #include <linux/uaccess.h> 18 #include <linux/capability.h> 19 #include <linux/kernel_stat.h> 20 #include <linux/gfp.h> 21 #include <linux/mm.h> 22 #include <linux/swap.h> 23 #include <linux/mman.h> 24 #include <linux/pagemap.h> 25 #include <linux/file.h> 26 #include <linux/uio.h> 27 #include <linux/hash.h> 28 #include <linux/writeback.h> 29 #include <linux/backing-dev.h> 30 #include <linux/pagevec.h> 31 #include <linux/blkdev.h> 32 #include <linux/security.h> 33 #include <linux/cpuset.h> 34 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 35 #include <linux/hugetlb.h> 36 #include <linux/memcontrol.h> 37 #include <linux/cleancache.h> 38 #include <linux/rmap.h> 39 #include "internal.h" 40 41 #define CREATE_TRACE_POINTS 42 #include <trace/events/filemap.h> 43 44 /* 45 * FIXME: remove all knowledge of the buffer layer from the core VM 46 */ 47 #include <linux/buffer_head.h> /* for try_to_free_buffers */ 48 49 #include <asm/mman.h> 50 51 /* 52 * Shared mappings implemented 30.11.1994. It's not fully working yet, 53 * though. 54 * 55 * Shared mappings now work. 15.8.1995 Bruno. 56 * 57 * finished 'unifying' the page and buffer cache and SMP-threaded the 58 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> 59 * 60 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> 61 */ 62 63 /* 64 * Lock ordering: 65 * 66 * ->i_mmap_rwsem (truncate_pagecache) 67 * ->private_lock (__free_pte->__set_page_dirty_buffers) 68 * ->swap_lock (exclusive_swap_page, others) 69 * ->mapping->tree_lock 70 * 71 * ->i_mutex 72 * ->i_mmap_rwsem (truncate->unmap_mapping_range) 73 * 74 * ->mmap_sem 75 * ->i_mmap_rwsem 76 * ->page_table_lock or pte_lock (various, mainly in memory.c) 77 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 78 * 79 * ->mmap_sem 80 * ->lock_page (access_process_vm) 81 * 82 * ->i_mutex (generic_perform_write) 83 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 84 * 85 * bdi->wb.list_lock 86 * sb_lock (fs/fs-writeback.c) 87 * ->mapping->tree_lock (__sync_single_inode) 88 * 89 * ->i_mmap_rwsem 90 * ->anon_vma.lock (vma_adjust) 91 * 92 * ->anon_vma.lock 93 * ->page_table_lock or pte_lock (anon_vma_prepare and various) 94 * 95 * ->page_table_lock or pte_lock 96 * ->swap_lock (try_to_unmap_one) 97 * ->private_lock (try_to_unmap_one) 98 * ->tree_lock (try_to_unmap_one) 99 * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) 100 * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) 101 * ->private_lock (page_remove_rmap->set_page_dirty) 102 * ->tree_lock (page_remove_rmap->set_page_dirty) 103 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) 104 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 105 * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) 106 * bdi.wb->list_lock (zap_pte_range->set_page_dirty) 107 * ->inode->i_lock (zap_pte_range->set_page_dirty) 108 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 109 * 110 * ->i_mmap_rwsem 111 * ->tasklist_lock (memory_failure, collect_procs_ao) 112 */ 113 114 static int page_cache_tree_insert(struct address_space *mapping, 115 struct page *page, void 
**shadowp) 116 { 117 struct radix_tree_node *node; 118 void **slot; 119 int error; 120 121 error = __radix_tree_create(&mapping->page_tree, page->index, 0, 122 &node, &slot); 123 if (error) 124 return error; 125 if (*slot) { 126 void *p; 127 128 p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 129 if (!radix_tree_exceptional_entry(p)) 130 return -EEXIST; 131 132 mapping->nrexceptional--; 133 if (!dax_mapping(mapping)) { 134 if (shadowp) 135 *shadowp = p; 136 } else { 137 /* DAX can replace empty locked entry with a hole */ 138 WARN_ON_ONCE(p != 139 dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); 140 /* Wakeup waiters for exceptional entry lock */ 141 dax_wake_mapping_entry_waiter(mapping, page->index, p, 142 true); 143 } 144 } 145 __radix_tree_replace(&mapping->page_tree, node, slot, page, 146 workingset_update_node, mapping); 147 mapping->nrpages++; 148 return 0; 149 } 150 151 static void page_cache_tree_delete(struct address_space *mapping, 152 struct page *page, void *shadow) 153 { 154 int i, nr; 155 156 /* hugetlb pages are represented by one entry in the radix tree */ 157 nr = PageHuge(page) ? 1 : hpage_nr_pages(page); 158 159 VM_BUG_ON_PAGE(!PageLocked(page), page); 160 VM_BUG_ON_PAGE(PageTail(page), page); 161 VM_BUG_ON_PAGE(nr != 1 && shadow, page); 162 163 for (i = 0; i < nr; i++) { 164 struct radix_tree_node *node; 165 void **slot; 166 167 __radix_tree_lookup(&mapping->page_tree, page->index + i, 168 &node, &slot); 169 170 VM_BUG_ON_PAGE(!node && nr != 1, page); 171 172 radix_tree_clear_tags(&mapping->page_tree, node, slot); 173 __radix_tree_replace(&mapping->page_tree, node, slot, shadow, 174 workingset_update_node, mapping); 175 } 176 177 if (shadow) { 178 mapping->nrexceptional += nr; 179 /* 180 * Make sure the nrexceptional update is committed before 181 * the nrpages update so that final truncate racing 182 * with reclaim does not see both counters 0 at the 183 * same time and miss a shadow entry. 184 */ 185 smp_wmb(); 186 } 187 mapping->nrpages -= nr; 188 } 189 190 /* 191 * Delete a page from the page cache and free it. Caller has to make 192 * sure the page is locked and that nobody else uses it - or that usage 193 * is safe. The caller must hold the mapping's tree_lock. 194 */ 195 void __delete_from_page_cache(struct page *page, void *shadow) 196 { 197 struct address_space *mapping = page->mapping; 198 int nr = hpage_nr_pages(page); 199 200 trace_mm_filemap_delete_from_page_cache(page); 201 /* 202 * if we're uptodate, flush out into the cleancache, otherwise 203 * invalidate any existing cleancache entries. We can't leave 204 * stale data around in the cleancache once our page is gone 205 */ 206 if (PageUptodate(page) && PageMappedToDisk(page)) 207 cleancache_put_page(page); 208 else 209 cleancache_invalidate_page(mapping, page); 210 211 VM_BUG_ON_PAGE(PageTail(page), page); 212 VM_BUG_ON_PAGE(page_mapped(page), page); 213 if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { 214 int mapcount; 215 216 pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", 217 current->comm, page_to_pfn(page)); 218 dump_page(page, "still mapped when deleted"); 219 dump_stack(); 220 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 221 222 mapcount = page_mapcount(page); 223 if (mapping_exiting(mapping) && 224 page_count(page) >= mapcount + 2) { 225 /* 226 * All vmas have already been torn down, so it's 227 * a good bet that actually the page is unmapped, 228 * and we'd prefer not to leak it: if we're wrong, 229 * some other bad page check should catch it later. 
230 */ 231 page_mapcount_reset(page); 232 page_ref_sub(page, mapcount); 233 } 234 } 235 236 page_cache_tree_delete(mapping, page, shadow); 237 238 page->mapping = NULL; 239 /* Leave page->index set: truncation lookup relies upon it */ 240 241 /* hugetlb pages do not participate in page cache accounting. */ 242 if (PageHuge(page)) 243 return; 244 245 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); 246 if (PageSwapBacked(page)) { 247 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); 248 if (PageTransHuge(page)) 249 __dec_node_page_state(page, NR_SHMEM_THPS); 250 } else { 251 VM_BUG_ON_PAGE(PageTransHuge(page), page); 252 } 253 254 /* 255 * At this point page must be either written or cleaned by truncate. 256 * Dirty page here signals a bug and loss of unwritten data. 257 * 258 * This fixes dirty accounting after removing the page entirely but 259 * leaves PageDirty set: it has no effect for truncated page and 260 * anyway will be cleared before returning page into buddy allocator. 261 */ 262 if (WARN_ON_ONCE(PageDirty(page))) 263 account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); 264 } 265 266 /** 267 * delete_from_page_cache - delete page from page cache 268 * @page: the page which the kernel is trying to remove from page cache 269 * 270 * This must be called only on pages that have been verified to be in the page 271 * cache and locked. It will never put the page into the free list, the caller 272 * has a reference on the page. 273 */ 274 void delete_from_page_cache(struct page *page) 275 { 276 struct address_space *mapping = page_mapping(page); 277 unsigned long flags; 278 void (*freepage)(struct page *); 279 280 BUG_ON(!PageLocked(page)); 281 282 freepage = mapping->a_ops->freepage; 283 284 spin_lock_irqsave(&mapping->tree_lock, flags); 285 __delete_from_page_cache(page, NULL); 286 spin_unlock_irqrestore(&mapping->tree_lock, flags); 287 288 if (freepage) 289 freepage(page); 290 291 if (PageTransHuge(page) && !PageHuge(page)) { 292 page_ref_sub(page, HPAGE_PMD_NR); 293 VM_BUG_ON_PAGE(page_count(page) <= 0, page); 294 } else { 295 put_page(page); 296 } 297 } 298 EXPORT_SYMBOL(delete_from_page_cache); 299 300 int filemap_check_errors(struct address_space *mapping) 301 { 302 int ret = 0; 303 /* Check for outstanding write errors */ 304 if (test_bit(AS_ENOSPC, &mapping->flags) && 305 test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 306 ret = -ENOSPC; 307 if (test_bit(AS_EIO, &mapping->flags) && 308 test_and_clear_bit(AS_EIO, &mapping->flags)) 309 ret = -EIO; 310 return ret; 311 } 312 EXPORT_SYMBOL(filemap_check_errors); 313 314 static int filemap_check_and_keep_errors(struct address_space *mapping) 315 { 316 /* Check for outstanding write errors */ 317 if (test_bit(AS_EIO, &mapping->flags)) 318 return -EIO; 319 if (test_bit(AS_ENOSPC, &mapping->flags)) 320 return -ENOSPC; 321 return 0; 322 } 323 324 /** 325 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range 326 * @mapping: address space structure to write 327 * @start: offset in bytes where the range starts 328 * @end: offset in bytes where the range ends (inclusive) 329 * @sync_mode: enable synchronous operation 330 * 331 * Start writeback against all of a mapping's dirty pages that lie 332 * within the byte offsets <start, end> inclusive. 333 * 334 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 335 * opposed to a regular memory cleansing writeback. 
The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	if (!mapping_cap_writeback_dirty(mapping))
		return 0;

	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
	ret = do_writepages(mapping, &wbc);
	wbc_detach_inode(&wbc);
	return ret;
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
		loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:	target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:		address space within which to check
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
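 *
 * As an illustrative sketch only (the surrounding caller is hypothetical,
 * not part of this file), a direct-I/O write path could use this to decide
 * whether the range must be flushed and waited on first:
 *
 *	if (filemap_range_has_page(mapping, pos, pos + count - 1))
 *		err = filemap_write_and_wait_range(mapping, pos,
 *						   pos + count - 1);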
399 */ 400 bool filemap_range_has_page(struct address_space *mapping, 401 loff_t start_byte, loff_t end_byte) 402 { 403 pgoff_t index = start_byte >> PAGE_SHIFT; 404 pgoff_t end = end_byte >> PAGE_SHIFT; 405 struct pagevec pvec; 406 bool ret; 407 408 if (end_byte < start_byte) 409 return false; 410 411 if (mapping->nrpages == 0) 412 return false; 413 414 pagevec_init(&pvec, 0); 415 if (!pagevec_lookup(&pvec, mapping, index, 1)) 416 return false; 417 ret = (pvec.pages[0]->index <= end); 418 pagevec_release(&pvec); 419 return ret; 420 } 421 EXPORT_SYMBOL(filemap_range_has_page); 422 423 static void __filemap_fdatawait_range(struct address_space *mapping, 424 loff_t start_byte, loff_t end_byte) 425 { 426 pgoff_t index = start_byte >> PAGE_SHIFT; 427 pgoff_t end = end_byte >> PAGE_SHIFT; 428 struct pagevec pvec; 429 int nr_pages; 430 431 if (end_byte < start_byte) 432 return; 433 434 pagevec_init(&pvec, 0); 435 while ((index <= end) && 436 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 437 PAGECACHE_TAG_WRITEBACK, 438 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { 439 unsigned i; 440 441 for (i = 0; i < nr_pages; i++) { 442 struct page *page = pvec.pages[i]; 443 444 /* until radix tree lookup accepts end_index */ 445 if (page->index > end) 446 continue; 447 448 wait_on_page_writeback(page); 449 ClearPageError(page); 450 } 451 pagevec_release(&pvec); 452 cond_resched(); 453 } 454 } 455 456 /** 457 * filemap_fdatawait_range - wait for writeback to complete 458 * @mapping: address space structure to wait for 459 * @start_byte: offset in bytes where the range starts 460 * @end_byte: offset in bytes where the range ends (inclusive) 461 * 462 * Walk the list of under-writeback pages of the given address space 463 * in the given range and wait for all of them. Check error status of 464 * the address space and return it. 465 * 466 * Since the error status of the address space is cleared by this function, 467 * callers are responsible for checking the return value and handling and/or 468 * reporting the error. 469 */ 470 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, 471 loff_t end_byte) 472 { 473 __filemap_fdatawait_range(mapping, start_byte, end_byte); 474 return filemap_check_errors(mapping); 475 } 476 EXPORT_SYMBOL(filemap_fdatawait_range); 477 478 /** 479 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors 480 * @mapping: address space structure to wait for 481 * 482 * Walk the list of under-writeback pages of the given address space 483 * and wait for all of them. Unlike filemap_fdatawait(), this function 484 * does not clear error status of the address space. 485 * 486 * Use this function if callers don't handle errors themselves. Expected 487 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), 488 * fsfreeze(8) 489 */ 490 int filemap_fdatawait_keep_errors(struct address_space *mapping) 491 { 492 loff_t i_size = i_size_read(mapping->host); 493 494 if (i_size == 0) 495 return 0; 496 497 __filemap_fdatawait_range(mapping, 0, i_size - 1); 498 return filemap_check_and_keep_errors(mapping); 499 } 500 EXPORT_SYMBOL(filemap_fdatawait_keep_errors); 501 502 /** 503 * filemap_fdatawait - wait for all under-writeback pages to complete 504 * @mapping: address space structure to wait for 505 * 506 * Walk the list of under-writeback pages of the given address space 507 * and wait for all of them. Check error status of the address space 508 * and return it. 
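 *
 * Typically paired with an earlier filemap_fdatawrite(); the
 * filemap_write_and_wait() helper below does, in essence:
 *
 *	err = filemap_fdatawrite(mapping);
 *	if (err != -EIO) {
 *		int err2 = filemap_fdatawait(mapping);
 *		if (!err)
 *			err = err2;
 *	}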
509 * 510 * Since the error status of the address space is cleared by this function, 511 * callers are responsible for checking the return value and handling and/or 512 * reporting the error. 513 */ 514 int filemap_fdatawait(struct address_space *mapping) 515 { 516 loff_t i_size = i_size_read(mapping->host); 517 518 if (i_size == 0) 519 return 0; 520 521 return filemap_fdatawait_range(mapping, 0, i_size - 1); 522 } 523 EXPORT_SYMBOL(filemap_fdatawait); 524 525 int filemap_write_and_wait(struct address_space *mapping) 526 { 527 int err = 0; 528 529 if ((!dax_mapping(mapping) && mapping->nrpages) || 530 (dax_mapping(mapping) && mapping->nrexceptional)) { 531 err = filemap_fdatawrite(mapping); 532 /* 533 * Even if the above returned error, the pages may be 534 * written partially (e.g. -ENOSPC), so we wait for it. 535 * But the -EIO is special case, it may indicate the worst 536 * thing (e.g. bug) happened, so we avoid waiting for it. 537 */ 538 if (err != -EIO) { 539 int err2 = filemap_fdatawait(mapping); 540 if (!err) 541 err = err2; 542 } else { 543 /* Clear any previously stored errors */ 544 filemap_check_errors(mapping); 545 } 546 } else { 547 err = filemap_check_errors(mapping); 548 } 549 return err; 550 } 551 EXPORT_SYMBOL(filemap_write_and_wait); 552 553 /** 554 * filemap_write_and_wait_range - write out & wait on a file range 555 * @mapping: the address_space for the pages 556 * @lstart: offset in bytes where the range starts 557 * @lend: offset in bytes where the range ends (inclusive) 558 * 559 * Write out and wait upon file offsets lstart->lend, inclusive. 560 * 561 * Note that @lend is inclusive (describes the last byte to be written) so 562 * that this function can be used to write to the very end-of-file (end = -1). 563 */ 564 int filemap_write_and_wait_range(struct address_space *mapping, 565 loff_t lstart, loff_t lend) 566 { 567 int err = 0; 568 569 if ((!dax_mapping(mapping) && mapping->nrpages) || 570 (dax_mapping(mapping) && mapping->nrexceptional)) { 571 err = __filemap_fdatawrite_range(mapping, lstart, lend, 572 WB_SYNC_ALL); 573 /* See comment of filemap_write_and_wait() */ 574 if (err != -EIO) { 575 int err2 = filemap_fdatawait_range(mapping, 576 lstart, lend); 577 if (!err) 578 err = err2; 579 } else { 580 /* Clear any previously stored errors */ 581 filemap_check_errors(mapping); 582 } 583 } else { 584 err = filemap_check_errors(mapping); 585 } 586 return err; 587 } 588 EXPORT_SYMBOL(filemap_write_and_wait_range); 589 590 void __filemap_set_wb_err(struct address_space *mapping, int err) 591 { 592 errseq_t eseq = __errseq_set(&mapping->wb_err, err); 593 594 trace_filemap_set_wb_err(mapping, eseq); 595 } 596 EXPORT_SYMBOL(__filemap_set_wb_err); 597 598 /** 599 * file_check_and_advance_wb_err - report wb error (if any) that was previously 600 * and advance wb_err to current one 601 * @file: struct file on which the error is being reported 602 * 603 * When userland calls fsync (or something like nfsd does the equivalent), we 604 * want to report any writeback errors that occurred since the last fsync (or 605 * since the file was opened if there haven't been any). 606 * 607 * Grab the wb_err from the mapping. If it matches what we have in the file, 608 * then just quickly return 0. The file is all caught up. 609 * 610 * If it doesn't match, then take the mapping value, set the "seen" flag in 611 * it and try to swap it into place. If it works, or another task beat us 612 * to it with the new value, then update the f_wb_err and return the error 613 * portion. 
The error at this point must be reported via proper channels 614 * (a'la fsync, or NFS COMMIT operation, etc.). 615 * 616 * While we handle mapping->wb_err with atomic operations, the f_wb_err 617 * value is protected by the f_lock since we must ensure that it reflects 618 * the latest value swapped in for this file descriptor. 619 */ 620 int file_check_and_advance_wb_err(struct file *file) 621 { 622 int err = 0; 623 errseq_t old = READ_ONCE(file->f_wb_err); 624 struct address_space *mapping = file->f_mapping; 625 626 /* Locklessly handle the common case where nothing has changed */ 627 if (errseq_check(&mapping->wb_err, old)) { 628 /* Something changed, must use slow path */ 629 spin_lock(&file->f_lock); 630 old = file->f_wb_err; 631 err = errseq_check_and_advance(&mapping->wb_err, 632 &file->f_wb_err); 633 trace_file_check_and_advance_wb_err(file, old); 634 spin_unlock(&file->f_lock); 635 } 636 return err; 637 } 638 EXPORT_SYMBOL(file_check_and_advance_wb_err); 639 640 /** 641 * file_write_and_wait_range - write out & wait on a file range 642 * @file: file pointing to address_space with pages 643 * @lstart: offset in bytes where the range starts 644 * @lend: offset in bytes where the range ends (inclusive) 645 * 646 * Write out and wait upon file offsets lstart->lend, inclusive. 647 * 648 * Note that @lend is inclusive (describes the last byte to be written) so 649 * that this function can be used to write to the very end-of-file (end = -1). 650 * 651 * After writing out and waiting on the data, we check and advance the 652 * f_wb_err cursor to the latest value, and return any errors detected there. 653 */ 654 int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) 655 { 656 int err = 0, err2; 657 struct address_space *mapping = file->f_mapping; 658 659 if ((!dax_mapping(mapping) && mapping->nrpages) || 660 (dax_mapping(mapping) && mapping->nrexceptional)) { 661 err = __filemap_fdatawrite_range(mapping, lstart, lend, 662 WB_SYNC_ALL); 663 /* See comment of filemap_write_and_wait() */ 664 if (err != -EIO) 665 __filemap_fdatawait_range(mapping, lstart, lend); 666 } 667 err2 = file_check_and_advance_wb_err(file); 668 if (!err) 669 err = err2; 670 return err; 671 } 672 EXPORT_SYMBOL(file_write_and_wait_range); 673 674 /** 675 * replace_page_cache_page - replace a pagecache page with a new one 676 * @old: page to be replaced 677 * @new: page to replace with 678 * @gfp_mask: allocation mode 679 * 680 * This function replaces a page in the pagecache with a new one. On 681 * success it acquires the pagecache reference for the new page and 682 * drops it for the old page. Both the old and new pages must be 683 * locked. This function does not add the new page to the LRU, the 684 * caller must do that. 685 * 686 * The remove + add is atomic. The only way this function can fail is 687 * memory allocation failure. 
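 *
 * A rough usage sketch (the caller below is hypothetical, illustrative
 * only): both pages are locked, and the caller puts the new page on the
 * LRU afterwards as required:
 *
 *	lock_page(oldpage);
 *	lock_page(newpage);
 *	error = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
 *	if (!error)
 *		lru_cache_add(newpage);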
688 */ 689 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 690 { 691 int error; 692 693 VM_BUG_ON_PAGE(!PageLocked(old), old); 694 VM_BUG_ON_PAGE(!PageLocked(new), new); 695 VM_BUG_ON_PAGE(new->mapping, new); 696 697 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 698 if (!error) { 699 struct address_space *mapping = old->mapping; 700 void (*freepage)(struct page *); 701 unsigned long flags; 702 703 pgoff_t offset = old->index; 704 freepage = mapping->a_ops->freepage; 705 706 get_page(new); 707 new->mapping = mapping; 708 new->index = offset; 709 710 spin_lock_irqsave(&mapping->tree_lock, flags); 711 __delete_from_page_cache(old, NULL); 712 error = page_cache_tree_insert(mapping, new, NULL); 713 BUG_ON(error); 714 715 /* 716 * hugetlb pages do not participate in page cache accounting. 717 */ 718 if (!PageHuge(new)) 719 __inc_node_page_state(new, NR_FILE_PAGES); 720 if (PageSwapBacked(new)) 721 __inc_node_page_state(new, NR_SHMEM); 722 spin_unlock_irqrestore(&mapping->tree_lock, flags); 723 mem_cgroup_migrate(old, new); 724 radix_tree_preload_end(); 725 if (freepage) 726 freepage(old); 727 put_page(old); 728 } 729 730 return error; 731 } 732 EXPORT_SYMBOL_GPL(replace_page_cache_page); 733 734 static int __add_to_page_cache_locked(struct page *page, 735 struct address_space *mapping, 736 pgoff_t offset, gfp_t gfp_mask, 737 void **shadowp) 738 { 739 int huge = PageHuge(page); 740 struct mem_cgroup *memcg; 741 int error; 742 743 VM_BUG_ON_PAGE(!PageLocked(page), page); 744 VM_BUG_ON_PAGE(PageSwapBacked(page), page); 745 746 if (!huge) { 747 error = mem_cgroup_try_charge(page, current->mm, 748 gfp_mask, &memcg, false); 749 if (error) 750 return error; 751 } 752 753 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); 754 if (error) { 755 if (!huge) 756 mem_cgroup_cancel_charge(page, memcg, false); 757 return error; 758 } 759 760 get_page(page); 761 page->mapping = mapping; 762 page->index = offset; 763 764 spin_lock_irq(&mapping->tree_lock); 765 error = page_cache_tree_insert(mapping, page, shadowp); 766 radix_tree_preload_end(); 767 if (unlikely(error)) 768 goto err_insert; 769 770 /* hugetlb pages do not participate in page cache accounting. */ 771 if (!huge) 772 __inc_node_page_state(page, NR_FILE_PAGES); 773 spin_unlock_irq(&mapping->tree_lock); 774 if (!huge) 775 mem_cgroup_commit_charge(page, memcg, false, false); 776 trace_mm_filemap_add_to_page_cache(page); 777 return 0; 778 err_insert: 779 page->mapping = NULL; 780 /* Leave page->index set: truncation relies upon it */ 781 spin_unlock_irq(&mapping->tree_lock); 782 if (!huge) 783 mem_cgroup_cancel_charge(page, memcg, false); 784 put_page(page); 785 return error; 786 } 787 788 /** 789 * add_to_page_cache_locked - add a locked page to the pagecache 790 * @page: page to add 791 * @mapping: the page's address_space 792 * @offset: page index 793 * @gfp_mask: page allocation mode 794 * 795 * This function is used to add a page to the pagecache. It must be locked. 796 * This function does not add the page to the LRU. The caller must do that. 
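 *
 * The usual pattern for a freshly allocated page is roughly what the
 * add_to_page_cache() wrapper in <linux/pagemap.h> does (sketch only):
 *
 *	__SetPageLocked(page);
 *	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
 *	if (error)
 *		__ClearPageLocked(page);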
797 */ 798 int add_to_page_cache_locked(struct page *page, struct address_space *mapping, 799 pgoff_t offset, gfp_t gfp_mask) 800 { 801 return __add_to_page_cache_locked(page, mapping, offset, 802 gfp_mask, NULL); 803 } 804 EXPORT_SYMBOL(add_to_page_cache_locked); 805 806 int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 807 pgoff_t offset, gfp_t gfp_mask) 808 { 809 void *shadow = NULL; 810 int ret; 811 812 __SetPageLocked(page); 813 ret = __add_to_page_cache_locked(page, mapping, offset, 814 gfp_mask, &shadow); 815 if (unlikely(ret)) 816 __ClearPageLocked(page); 817 else { 818 /* 819 * The page might have been evicted from cache only 820 * recently, in which case it should be activated like 821 * any other repeatedly accessed page. 822 * The exception is pages getting rewritten; evicting other 823 * data from the working set, only to cache data that will 824 * get overwritten with something else, is a waste of memory. 825 */ 826 if (!(gfp_mask & __GFP_WRITE) && 827 shadow && workingset_refault(shadow)) { 828 SetPageActive(page); 829 workingset_activation(page); 830 } else 831 ClearPageActive(page); 832 lru_cache_add(page); 833 } 834 return ret; 835 } 836 EXPORT_SYMBOL_GPL(add_to_page_cache_lru); 837 838 #ifdef CONFIG_NUMA 839 struct page *__page_cache_alloc(gfp_t gfp) 840 { 841 int n; 842 struct page *page; 843 844 if (cpuset_do_page_mem_spread()) { 845 unsigned int cpuset_mems_cookie; 846 do { 847 cpuset_mems_cookie = read_mems_allowed_begin(); 848 n = cpuset_mem_spread_node(); 849 page = __alloc_pages_node(n, gfp, 0); 850 } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); 851 852 return page; 853 } 854 return alloc_pages(gfp, 0); 855 } 856 EXPORT_SYMBOL(__page_cache_alloc); 857 #endif 858 859 /* 860 * In order to wait for pages to become available there must be 861 * waitqueues associated with pages. By using a hash table of 862 * waitqueues where the bucket discipline is to maintain all 863 * waiters on the same queue and wake all when any of the pages 864 * become available, and for the woken contexts to check to be 865 * sure the appropriate page became available, this saves space 866 * at a cost of "thundering herd" phenomena during rare hash 867 * collisions. 
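 *
 * A rough sketch of how the two sides pair up (illustrative only):
 *
 *	waiter				waker
 *	------				-----
 *	wait_on_page_bit(page, PG_locked)
 *					unlock_page(page)
 *					  wake_up_page_bit(page, PG_locked)
 *
 * Both sides hash the same struct page to the same wait_queue_head_t in
 * page_wait_table[]; the waker passes a wait_page_key naming the page and
 * bit, and wake_page_function() below wakes only waiters whose entries
 * match that key, so waiters that merely collided in the hash are not woken.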
868 */ 869 #define PAGE_WAIT_TABLE_BITS 8 870 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) 871 static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; 872 873 static wait_queue_head_t *page_waitqueue(struct page *page) 874 { 875 return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)]; 876 } 877 878 void __init pagecache_init(void) 879 { 880 int i; 881 882 for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) 883 init_waitqueue_head(&page_wait_table[i]); 884 885 page_writeback_init(); 886 } 887 888 struct wait_page_key { 889 struct page *page; 890 int bit_nr; 891 int page_match; 892 }; 893 894 struct wait_page_queue { 895 struct page *page; 896 int bit_nr; 897 wait_queue_entry_t wait; 898 }; 899 900 static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) 901 { 902 struct wait_page_key *key = arg; 903 struct wait_page_queue *wait_page 904 = container_of(wait, struct wait_page_queue, wait); 905 906 if (wait_page->page != key->page) 907 return 0; 908 key->page_match = 1; 909 910 if (wait_page->bit_nr != key->bit_nr) 911 return 0; 912 if (test_bit(key->bit_nr, &key->page->flags)) 913 return 0; 914 915 return autoremove_wake_function(wait, mode, sync, key); 916 } 917 918 static void wake_up_page_bit(struct page *page, int bit_nr) 919 { 920 wait_queue_head_t *q = page_waitqueue(page); 921 struct wait_page_key key; 922 unsigned long flags; 923 924 key.page = page; 925 key.bit_nr = bit_nr; 926 key.page_match = 0; 927 928 spin_lock_irqsave(&q->lock, flags); 929 __wake_up_locked_key(q, TASK_NORMAL, &key); 930 /* 931 * It is possible for other pages to have collided on the waitqueue 932 * hash, so in that case check for a page match. That prevents a long- 933 * term waiter 934 * 935 * It is still possible to miss a case here, when we woke page waiters 936 * and removed them from the waitqueue, but there are still other 937 * page waiters. 938 */ 939 if (!waitqueue_active(q) || !key.page_match) { 940 ClearPageWaiters(page); 941 /* 942 * It's possible to miss clearing Waiters here, when we woke 943 * our page waiters, but the hashed waitqueue has waiters for 944 * other pages on it. 945 * 946 * That's okay, it's a rare case. The next waker will clear it. 947 */ 948 } 949 spin_unlock_irqrestore(&q->lock, flags); 950 } 951 952 static void wake_up_page(struct page *page, int bit) 953 { 954 if (!PageWaiters(page)) 955 return; 956 wake_up_page_bit(page, bit); 957 } 958 959 static inline int wait_on_page_bit_common(wait_queue_head_t *q, 960 struct page *page, int bit_nr, int state, bool lock) 961 { 962 struct wait_page_queue wait_page; 963 wait_queue_entry_t *wait = &wait_page.wait; 964 int ret = 0; 965 966 init_wait(wait); 967 wait->func = wake_page_function; 968 wait_page.page = page; 969 wait_page.bit_nr = bit_nr; 970 971 for (;;) { 972 spin_lock_irq(&q->lock); 973 974 if (likely(list_empty(&wait->entry))) { 975 if (lock) 976 __add_wait_queue_entry_tail_exclusive(q, wait); 977 else 978 __add_wait_queue(q, wait); 979 SetPageWaiters(page); 980 } 981 982 set_current_state(state); 983 984 spin_unlock_irq(&q->lock); 985 986 if (likely(test_bit(bit_nr, &page->flags))) { 987 io_schedule(); 988 if (unlikely(signal_pending_state(state, current))) { 989 ret = -EINTR; 990 break; 991 } 992 } 993 994 if (lock) { 995 if (!test_and_set_bit_lock(bit_nr, &page->flags)) 996 break; 997 } else { 998 if (!test_bit(bit_nr, &page->flags)) 999 break; 1000 } 1001 } 1002 1003 finish_wait(q, wait); 1004 1005 /* 1006 * A signal could leave PageWaiters set. 
Clearing it here if
 * !waitqueue_active would be possible (by open-coding finish_wait),
 * but still fail to catch it in the case of wait hash collision. We
 * already can fail to clear wait hash collision cases, so don't
 * bother with signals either.
 */

	return ret;
}

void wait_on_page_bit(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
}
EXPORT_SYMBOL(wait_on_page_bit);

int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
}

/**
 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
 * @page: Page defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @page.
 */
void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
{
	wait_queue_head_t *q = page_waitqueue(page);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, waiter);
	SetPageWaiters(page);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);

#ifndef clear_bit_unlock_is_negative_byte

/*
 * PG_waiters is the high bit in the same byte as PG_locked.
 *
 * On x86 (and on many other architectures), we can clear PG_locked and
 * test the sign bit at the same time. But if the architecture does
 * not support that special operation, we just do this all by hand
 * instead.
 *
 * The read of PG_waiters has to be after (or concurrently with) PG_locked
 * being cleared, but a memory barrier should be unnecessary since it is
 * in the same byte as PG_locked.
 */
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
{
	clear_bit_unlock(nr, mem);
	/* smp_mb__after_atomic(); */
	return test_bit(PG_waiters, mem);
}

#endif

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * Note that this depends on PG_waiters being the sign bit in the byte
 * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
 * clear the PG_locked bit and test PG_waiters at the same time fairly
 * portably (architectures that do LL/SC can test any bit, while x86 can
 * test the sign bit).
 */
void unlock_page(struct page *page)
{
	BUILD_BUG_ON(PG_waiters != 7);
	page = compound_head(page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
		wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
	/*
	 * TestClearPageReclaim could be used here but it is an atomic
	 * operation and overkill in this particular case.
Failing to 1105 * shuffle a page marked for immediate reclaim is too mild to 1106 * justify taking an atomic operation penalty at the end of 1107 * ever page writeback. 1108 */ 1109 if (PageReclaim(page)) { 1110 ClearPageReclaim(page); 1111 rotate_reclaimable_page(page); 1112 } 1113 1114 if (!test_clear_page_writeback(page)) 1115 BUG(); 1116 1117 smp_mb__after_atomic(); 1118 wake_up_page(page, PG_writeback); 1119 } 1120 EXPORT_SYMBOL(end_page_writeback); 1121 1122 /* 1123 * After completing I/O on a page, call this routine to update the page 1124 * flags appropriately 1125 */ 1126 void page_endio(struct page *page, bool is_write, int err) 1127 { 1128 if (!is_write) { 1129 if (!err) { 1130 SetPageUptodate(page); 1131 } else { 1132 ClearPageUptodate(page); 1133 SetPageError(page); 1134 } 1135 unlock_page(page); 1136 } else { 1137 if (err) { 1138 struct address_space *mapping; 1139 1140 SetPageError(page); 1141 mapping = page_mapping(page); 1142 if (mapping) 1143 mapping_set_error(mapping, err); 1144 } 1145 end_page_writeback(page); 1146 } 1147 } 1148 EXPORT_SYMBOL_GPL(page_endio); 1149 1150 /** 1151 * __lock_page - get a lock on the page, assuming we need to sleep to get it 1152 * @__page: the page to lock 1153 */ 1154 void __lock_page(struct page *__page) 1155 { 1156 struct page *page = compound_head(__page); 1157 wait_queue_head_t *q = page_waitqueue(page); 1158 wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true); 1159 } 1160 EXPORT_SYMBOL(__lock_page); 1161 1162 int __lock_page_killable(struct page *__page) 1163 { 1164 struct page *page = compound_head(__page); 1165 wait_queue_head_t *q = page_waitqueue(page); 1166 return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true); 1167 } 1168 EXPORT_SYMBOL_GPL(__lock_page_killable); 1169 1170 /* 1171 * Return values: 1172 * 1 - page is locked; mmap_sem is still held. 1173 * 0 - page is not locked. 1174 * mmap_sem has been released (up_read()), unless flags had both 1175 * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in 1176 * which case mmap_sem is still held. 1177 * 1178 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 1179 * with the page locked and the mmap_sem unperturbed. 1180 */ 1181 int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 1182 unsigned int flags) 1183 { 1184 if (flags & FAULT_FLAG_ALLOW_RETRY) { 1185 /* 1186 * CAUTION! In this case, mmap_sem is not released 1187 * even though return 0. 1188 */ 1189 if (flags & FAULT_FLAG_RETRY_NOWAIT) 1190 return 0; 1191 1192 up_read(&mm->mmap_sem); 1193 if (flags & FAULT_FLAG_KILLABLE) 1194 wait_on_page_locked_killable(page); 1195 else 1196 wait_on_page_locked(page); 1197 return 0; 1198 } else { 1199 if (flags & FAULT_FLAG_KILLABLE) { 1200 int ret; 1201 1202 ret = __lock_page_killable(page); 1203 if (ret) { 1204 up_read(&mm->mmap_sem); 1205 return 0; 1206 } 1207 } else 1208 __lock_page(page); 1209 return 1; 1210 } 1211 } 1212 1213 /** 1214 * page_cache_next_hole - find the next hole (not-present entry) 1215 * @mapping: mapping 1216 * @index: index 1217 * @max_scan: maximum range to search 1218 * 1219 * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the 1220 * lowest indexed hole. 1221 * 1222 * Returns: the index of the hole if found, otherwise returns an index 1223 * outside of the set specified (in which case 'return - index >= 1224 * max_scan' will be true). In rare cases of index wrap-around, 0 will 1225 * be returned. 1226 * 1227 * page_cache_next_hole may be called under rcu_read_lock. 
However, 1228 * like radix_tree_gang_lookup, this will not atomically search a 1229 * snapshot of the tree at a single point in time. For example, if a 1230 * hole is created at index 5, then subsequently a hole is created at 1231 * index 10, page_cache_next_hole covering both indexes may return 10 1232 * if called under rcu_read_lock. 1233 */ 1234 pgoff_t page_cache_next_hole(struct address_space *mapping, 1235 pgoff_t index, unsigned long max_scan) 1236 { 1237 unsigned long i; 1238 1239 for (i = 0; i < max_scan; i++) { 1240 struct page *page; 1241 1242 page = radix_tree_lookup(&mapping->page_tree, index); 1243 if (!page || radix_tree_exceptional_entry(page)) 1244 break; 1245 index++; 1246 if (index == 0) 1247 break; 1248 } 1249 1250 return index; 1251 } 1252 EXPORT_SYMBOL(page_cache_next_hole); 1253 1254 /** 1255 * page_cache_prev_hole - find the prev hole (not-present entry) 1256 * @mapping: mapping 1257 * @index: index 1258 * @max_scan: maximum range to search 1259 * 1260 * Search backwards in the range [max(index-max_scan+1, 0), index] for 1261 * the first hole. 1262 * 1263 * Returns: the index of the hole if found, otherwise returns an index 1264 * outside of the set specified (in which case 'index - return >= 1265 * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX 1266 * will be returned. 1267 * 1268 * page_cache_prev_hole may be called under rcu_read_lock. However, 1269 * like radix_tree_gang_lookup, this will not atomically search a 1270 * snapshot of the tree at a single point in time. For example, if a 1271 * hole is created at index 10, then subsequently a hole is created at 1272 * index 5, page_cache_prev_hole covering both indexes may return 5 if 1273 * called under rcu_read_lock. 1274 */ 1275 pgoff_t page_cache_prev_hole(struct address_space *mapping, 1276 pgoff_t index, unsigned long max_scan) 1277 { 1278 unsigned long i; 1279 1280 for (i = 0; i < max_scan; i++) { 1281 struct page *page; 1282 1283 page = radix_tree_lookup(&mapping->page_tree, index); 1284 if (!page || radix_tree_exceptional_entry(page)) 1285 break; 1286 index--; 1287 if (index == ULONG_MAX) 1288 break; 1289 } 1290 1291 return index; 1292 } 1293 EXPORT_SYMBOL(page_cache_prev_hole); 1294 1295 /** 1296 * find_get_entry - find and get a page cache entry 1297 * @mapping: the address_space to search 1298 * @offset: the page cache index 1299 * 1300 * Looks up the page cache slot at @mapping & @offset. If there is a 1301 * page cache page, it is returned with an increased refcount. 1302 * 1303 * If the slot holds a shadow entry of a previously evicted page, or a 1304 * swap entry from shmem/tmpfs, it is returned. 1305 * 1306 * Otherwise, %NULL is returned. 1307 */ 1308 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) 1309 { 1310 void **pagep; 1311 struct page *head, *page; 1312 1313 rcu_read_lock(); 1314 repeat: 1315 page = NULL; 1316 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); 1317 if (pagep) { 1318 page = radix_tree_deref_slot(pagep); 1319 if (unlikely(!page)) 1320 goto out; 1321 if (radix_tree_exception(page)) { 1322 if (radix_tree_deref_retry(page)) 1323 goto repeat; 1324 /* 1325 * A shadow entry of a recently evicted page, 1326 * or a swap entry from shmem/tmpfs. Return 1327 * it without attempting to raise page count. 1328 */ 1329 goto out; 1330 } 1331 1332 head = compound_head(page); 1333 if (!page_cache_get_speculative(head)) 1334 goto repeat; 1335 1336 /* The page was split under us? 
*/ 1337 if (compound_head(page) != head) { 1338 put_page(head); 1339 goto repeat; 1340 } 1341 1342 /* 1343 * Has the page moved? 1344 * This is part of the lockless pagecache protocol. See 1345 * include/linux/pagemap.h for details. 1346 */ 1347 if (unlikely(page != *pagep)) { 1348 put_page(head); 1349 goto repeat; 1350 } 1351 } 1352 out: 1353 rcu_read_unlock(); 1354 1355 return page; 1356 } 1357 EXPORT_SYMBOL(find_get_entry); 1358 1359 /** 1360 * find_lock_entry - locate, pin and lock a page cache entry 1361 * @mapping: the address_space to search 1362 * @offset: the page cache index 1363 * 1364 * Looks up the page cache slot at @mapping & @offset. If there is a 1365 * page cache page, it is returned locked and with an increased 1366 * refcount. 1367 * 1368 * If the slot holds a shadow entry of a previously evicted page, or a 1369 * swap entry from shmem/tmpfs, it is returned. 1370 * 1371 * Otherwise, %NULL is returned. 1372 * 1373 * find_lock_entry() may sleep. 1374 */ 1375 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) 1376 { 1377 struct page *page; 1378 1379 repeat: 1380 page = find_get_entry(mapping, offset); 1381 if (page && !radix_tree_exception(page)) { 1382 lock_page(page); 1383 /* Has the page been truncated? */ 1384 if (unlikely(page_mapping(page) != mapping)) { 1385 unlock_page(page); 1386 put_page(page); 1387 goto repeat; 1388 } 1389 VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); 1390 } 1391 return page; 1392 } 1393 EXPORT_SYMBOL(find_lock_entry); 1394 1395 /** 1396 * pagecache_get_page - find and get a page reference 1397 * @mapping: the address_space to search 1398 * @offset: the page index 1399 * @fgp_flags: PCG flags 1400 * @gfp_mask: gfp mask to use for the page cache data page allocation 1401 * 1402 * Looks up the page cache slot at @mapping & @offset. 1403 * 1404 * PCG flags modify how the page is returned. 1405 * 1406 * @fgp_flags can be: 1407 * 1408 * - FGP_ACCESSED: the page will be marked accessed 1409 * - FGP_LOCK: Page is return locked 1410 * - FGP_CREAT: If page is not present then a new page is allocated using 1411 * @gfp_mask and added to the page cache and the VM's LRU 1412 * list. The page is returned locked and with an increased 1413 * refcount. Otherwise, NULL is returned. 1414 * 1415 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even 1416 * if the GFP flags specified for FGP_CREAT are atomic. 1417 * 1418 * If there is a page cache page, it is returned with an increased refcount. 1419 */ 1420 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, 1421 int fgp_flags, gfp_t gfp_mask) 1422 { 1423 struct page *page; 1424 1425 repeat: 1426 page = find_get_entry(mapping, offset); 1427 if (radix_tree_exceptional_entry(page)) 1428 page = NULL; 1429 if (!page) 1430 goto no_page; 1431 1432 if (fgp_flags & FGP_LOCK) { 1433 if (fgp_flags & FGP_NOWAIT) { 1434 if (!trylock_page(page)) { 1435 put_page(page); 1436 return NULL; 1437 } 1438 } else { 1439 lock_page(page); 1440 } 1441 1442 /* Has the page been truncated? 
*/ 1443 if (unlikely(page->mapping != mapping)) { 1444 unlock_page(page); 1445 put_page(page); 1446 goto repeat; 1447 } 1448 VM_BUG_ON_PAGE(page->index != offset, page); 1449 } 1450 1451 if (page && (fgp_flags & FGP_ACCESSED)) 1452 mark_page_accessed(page); 1453 1454 no_page: 1455 if (!page && (fgp_flags & FGP_CREAT)) { 1456 int err; 1457 if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) 1458 gfp_mask |= __GFP_WRITE; 1459 if (fgp_flags & FGP_NOFS) 1460 gfp_mask &= ~__GFP_FS; 1461 1462 page = __page_cache_alloc(gfp_mask); 1463 if (!page) 1464 return NULL; 1465 1466 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) 1467 fgp_flags |= FGP_LOCK; 1468 1469 /* Init accessed so avoid atomic mark_page_accessed later */ 1470 if (fgp_flags & FGP_ACCESSED) 1471 __SetPageReferenced(page); 1472 1473 err = add_to_page_cache_lru(page, mapping, offset, 1474 gfp_mask & GFP_RECLAIM_MASK); 1475 if (unlikely(err)) { 1476 put_page(page); 1477 page = NULL; 1478 if (err == -EEXIST) 1479 goto repeat; 1480 } 1481 } 1482 1483 return page; 1484 } 1485 EXPORT_SYMBOL(pagecache_get_page); 1486 1487 /** 1488 * find_get_entries - gang pagecache lookup 1489 * @mapping: The address_space to search 1490 * @start: The starting page cache index 1491 * @nr_entries: The maximum number of entries 1492 * @entries: Where the resulting entries are placed 1493 * @indices: The cache indices corresponding to the entries in @entries 1494 * 1495 * find_get_entries() will search for and return a group of up to 1496 * @nr_entries entries in the mapping. The entries are placed at 1497 * @entries. find_get_entries() takes a reference against any actual 1498 * pages it returns. 1499 * 1500 * The search returns a group of mapping-contiguous page cache entries 1501 * with ascending indexes. There may be holes in the indices due to 1502 * not-present pages. 1503 * 1504 * Any shadow entries of evicted pages, or swap entries from 1505 * shmem/tmpfs, are included in the returned array. 1506 * 1507 * find_get_entries() returns the number of pages and shadow entries 1508 * which were found. 1509 */ 1510 unsigned find_get_entries(struct address_space *mapping, 1511 pgoff_t start, unsigned int nr_entries, 1512 struct page **entries, pgoff_t *indices) 1513 { 1514 void **slot; 1515 unsigned int ret = 0; 1516 struct radix_tree_iter iter; 1517 1518 if (!nr_entries) 1519 return 0; 1520 1521 rcu_read_lock(); 1522 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 1523 struct page *head, *page; 1524 repeat: 1525 page = radix_tree_deref_slot(slot); 1526 if (unlikely(!page)) 1527 continue; 1528 if (radix_tree_exception(page)) { 1529 if (radix_tree_deref_retry(page)) { 1530 slot = radix_tree_iter_retry(&iter); 1531 continue; 1532 } 1533 /* 1534 * A shadow entry of a recently evicted page, a swap 1535 * entry from shmem/tmpfs or a DAX entry. Return it 1536 * without attempting to raise page count. 1537 */ 1538 goto export; 1539 } 1540 1541 head = compound_head(page); 1542 if (!page_cache_get_speculative(head)) 1543 goto repeat; 1544 1545 /* The page was split under us? */ 1546 if (compound_head(page) != head) { 1547 put_page(head); 1548 goto repeat; 1549 } 1550 1551 /* Has the page moved? 
*/ 1552 if (unlikely(page != *slot)) { 1553 put_page(head); 1554 goto repeat; 1555 } 1556 export: 1557 indices[ret] = iter.index; 1558 entries[ret] = page; 1559 if (++ret == nr_entries) 1560 break; 1561 } 1562 rcu_read_unlock(); 1563 return ret; 1564 } 1565 1566 /** 1567 * find_get_pages - gang pagecache lookup 1568 * @mapping: The address_space to search 1569 * @start: The starting page index 1570 * @nr_pages: The maximum number of pages 1571 * @pages: Where the resulting pages are placed 1572 * 1573 * find_get_pages() will search for and return a group of up to 1574 * @nr_pages pages in the mapping. The pages are placed at @pages. 1575 * find_get_pages() takes a reference against the returned pages. 1576 * 1577 * The search returns a group of mapping-contiguous pages with ascending 1578 * indexes. There may be holes in the indices due to not-present pages. 1579 * 1580 * find_get_pages() returns the number of pages which were found. 1581 */ 1582 unsigned find_get_pages(struct address_space *mapping, pgoff_t start, 1583 unsigned int nr_pages, struct page **pages) 1584 { 1585 struct radix_tree_iter iter; 1586 void **slot; 1587 unsigned ret = 0; 1588 1589 if (unlikely(!nr_pages)) 1590 return 0; 1591 1592 rcu_read_lock(); 1593 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 1594 struct page *head, *page; 1595 repeat: 1596 page = radix_tree_deref_slot(slot); 1597 if (unlikely(!page)) 1598 continue; 1599 1600 if (radix_tree_exception(page)) { 1601 if (radix_tree_deref_retry(page)) { 1602 slot = radix_tree_iter_retry(&iter); 1603 continue; 1604 } 1605 /* 1606 * A shadow entry of a recently evicted page, 1607 * or a swap entry from shmem/tmpfs. Skip 1608 * over it. 1609 */ 1610 continue; 1611 } 1612 1613 head = compound_head(page); 1614 if (!page_cache_get_speculative(head)) 1615 goto repeat; 1616 1617 /* The page was split under us? */ 1618 if (compound_head(page) != head) { 1619 put_page(head); 1620 goto repeat; 1621 } 1622 1623 /* Has the page moved? */ 1624 if (unlikely(page != *slot)) { 1625 put_page(head); 1626 goto repeat; 1627 } 1628 1629 pages[ret] = page; 1630 if (++ret == nr_pages) 1631 break; 1632 } 1633 1634 rcu_read_unlock(); 1635 return ret; 1636 } 1637 1638 /** 1639 * find_get_pages_contig - gang contiguous pagecache lookup 1640 * @mapping: The address_space to search 1641 * @index: The starting page index 1642 * @nr_pages: The maximum number of pages 1643 * @pages: Where the resulting pages are placed 1644 * 1645 * find_get_pages_contig() works exactly like find_get_pages(), except 1646 * that the returned number of pages are guaranteed to be contiguous. 1647 * 1648 * find_get_pages_contig() returns the number of pages which were found. 1649 */ 1650 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, 1651 unsigned int nr_pages, struct page **pages) 1652 { 1653 struct radix_tree_iter iter; 1654 void **slot; 1655 unsigned int ret = 0; 1656 1657 if (unlikely(!nr_pages)) 1658 return 0; 1659 1660 rcu_read_lock(); 1661 radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { 1662 struct page *head, *page; 1663 repeat: 1664 page = radix_tree_deref_slot(slot); 1665 /* The hole, there no reason to continue */ 1666 if (unlikely(!page)) 1667 break; 1668 1669 if (radix_tree_exception(page)) { 1670 if (radix_tree_deref_retry(page)) { 1671 slot = radix_tree_iter_retry(&iter); 1672 continue; 1673 } 1674 /* 1675 * A shadow entry of a recently evicted page, 1676 * or a swap entry from shmem/tmpfs. Stop 1677 * looking for contiguous pages. 
1678 */ 1679 break; 1680 } 1681 1682 head = compound_head(page); 1683 if (!page_cache_get_speculative(head)) 1684 goto repeat; 1685 1686 /* The page was split under us? */ 1687 if (compound_head(page) != head) { 1688 put_page(head); 1689 goto repeat; 1690 } 1691 1692 /* Has the page moved? */ 1693 if (unlikely(page != *slot)) { 1694 put_page(head); 1695 goto repeat; 1696 } 1697 1698 /* 1699 * must check mapping and index after taking the ref. 1700 * otherwise we can get both false positives and false 1701 * negatives, which is just confusing to the caller. 1702 */ 1703 if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { 1704 put_page(page); 1705 break; 1706 } 1707 1708 pages[ret] = page; 1709 if (++ret == nr_pages) 1710 break; 1711 } 1712 rcu_read_unlock(); 1713 return ret; 1714 } 1715 EXPORT_SYMBOL(find_get_pages_contig); 1716 1717 /** 1718 * find_get_pages_tag - find and return pages that match @tag 1719 * @mapping: the address_space to search 1720 * @index: the starting page index 1721 * @tag: the tag index 1722 * @nr_pages: the maximum number of pages 1723 * @pages: where the resulting pages are placed 1724 * 1725 * Like find_get_pages, except we only return pages which are tagged with 1726 * @tag. We update @index to index the next page for the traversal. 1727 */ 1728 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 1729 int tag, unsigned int nr_pages, struct page **pages) 1730 { 1731 struct radix_tree_iter iter; 1732 void **slot; 1733 unsigned ret = 0; 1734 1735 if (unlikely(!nr_pages)) 1736 return 0; 1737 1738 rcu_read_lock(); 1739 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1740 &iter, *index, tag) { 1741 struct page *head, *page; 1742 repeat: 1743 page = radix_tree_deref_slot(slot); 1744 if (unlikely(!page)) 1745 continue; 1746 1747 if (radix_tree_exception(page)) { 1748 if (radix_tree_deref_retry(page)) { 1749 slot = radix_tree_iter_retry(&iter); 1750 continue; 1751 } 1752 /* 1753 * A shadow entry of a recently evicted page. 1754 * 1755 * Those entries should never be tagged, but 1756 * this tree walk is lockless and the tags are 1757 * looked up in bulk, one radix tree node at a 1758 * time, so there is a sizable window for page 1759 * reclaim to evict a page we saw tagged. 1760 * 1761 * Skip over it. 1762 */ 1763 continue; 1764 } 1765 1766 head = compound_head(page); 1767 if (!page_cache_get_speculative(head)) 1768 goto repeat; 1769 1770 /* The page was split under us? */ 1771 if (compound_head(page) != head) { 1772 put_page(head); 1773 goto repeat; 1774 } 1775 1776 /* Has the page moved? */ 1777 if (unlikely(page != *slot)) { 1778 put_page(head); 1779 goto repeat; 1780 } 1781 1782 pages[ret] = page; 1783 if (++ret == nr_pages) 1784 break; 1785 } 1786 1787 rcu_read_unlock(); 1788 1789 if (ret) 1790 *index = pages[ret - 1]->index + 1; 1791 1792 return ret; 1793 } 1794 EXPORT_SYMBOL(find_get_pages_tag); 1795 1796 /** 1797 * find_get_entries_tag - find and return entries that match @tag 1798 * @mapping: the address_space to search 1799 * @start: the starting page cache index 1800 * @tag: the tag index 1801 * @nr_entries: the maximum number of entries 1802 * @entries: where the resulting entries are placed 1803 * @indices: the cache indices corresponding to the entries in @entries 1804 * 1805 * Like find_get_entries, except we only return entries which are tagged with 1806 * @tag. 
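 *
 * A sketch of a typical forward walk over dirty-tagged entries (the caller
 * here is hypothetical, illustrative only; real pages carry a reference
 * that must be dropped, exceptional entries do not):
 *
 *	pgoff_t index = start;
 *	unsigned i, n;
 *
 *	while ((n = find_get_entries_tag(mapping, index, PAGECACHE_TAG_DIRTY,
 *					 PAGEVEC_SIZE, entries, indices))) {
 *		for (i = 0; i < n; i++)
 *			process(entries[i], indices[i]);
 *		index = indices[n - 1] + 1;
 *	}
 *
 * where process() stands in for whatever the caller does with each entry.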
1807 */ 1808 unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 1809 int tag, unsigned int nr_entries, 1810 struct page **entries, pgoff_t *indices) 1811 { 1812 void **slot; 1813 unsigned int ret = 0; 1814 struct radix_tree_iter iter; 1815 1816 if (!nr_entries) 1817 return 0; 1818 1819 rcu_read_lock(); 1820 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1821 &iter, start, tag) { 1822 struct page *head, *page; 1823 repeat: 1824 page = radix_tree_deref_slot(slot); 1825 if (unlikely(!page)) 1826 continue; 1827 if (radix_tree_exception(page)) { 1828 if (radix_tree_deref_retry(page)) { 1829 slot = radix_tree_iter_retry(&iter); 1830 continue; 1831 } 1832 1833 /* 1834 * A shadow entry of a recently evicted page, a swap 1835 * entry from shmem/tmpfs or a DAX entry. Return it 1836 * without attempting to raise page count. 1837 */ 1838 goto export; 1839 } 1840 1841 head = compound_head(page); 1842 if (!page_cache_get_speculative(head)) 1843 goto repeat; 1844 1845 /* The page was split under us? */ 1846 if (compound_head(page) != head) { 1847 put_page(head); 1848 goto repeat; 1849 } 1850 1851 /* Has the page moved? */ 1852 if (unlikely(page != *slot)) { 1853 put_page(head); 1854 goto repeat; 1855 } 1856 export: 1857 indices[ret] = iter.index; 1858 entries[ret] = page; 1859 if (++ret == nr_entries) 1860 break; 1861 } 1862 rcu_read_unlock(); 1863 return ret; 1864 } 1865 EXPORT_SYMBOL(find_get_entries_tag); 1866 1867 /* 1868 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 1869 * a _large_ part of the i/o request. Imagine the worst scenario: 1870 * 1871 * ---R__________________________________________B__________ 1872 * ^ reading here ^ bad block(assume 4k) 1873 * 1874 * read(R) => miss => readahead(R...B) => media error => frustrating retries 1875 * => failing the whole request => read(R) => read(R+1) => 1876 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => 1877 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => 1878 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... 1879 * 1880 * It is going insane. Fix it by quickly scaling down the readahead size. 1881 */ 1882 static void shrink_readahead_size_eio(struct file *filp, 1883 struct file_ra_state *ra) 1884 { 1885 ra->ra_pages /= 4; 1886 } 1887 1888 /** 1889 * do_generic_file_read - generic file read routine 1890 * @filp: the file to read 1891 * @ppos: current file position 1892 * @iter: data destination 1893 * @written: already copied 1894 * 1895 * This is a generic file read routine, and uses the 1896 * mapping->a_ops->readpage() function for the actual low-level stuff. 1897 * 1898 * This is really ugly. But the goto's actually try to clarify some 1899 * of the logic when it comes to error handling etc. 
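 *
 * Roughly, the labels below form this per-page state machine (sketch only):
 *
 *	find_page:		look the page up; start readahead if it is
 *				missing or marked PageReadahead
 *	page_ok:		page is Uptodate, copy it into the iov_iter
 *	page_not_up_to_date:	lock the page before reading it
 *	readpage:		ask the filesystem's ->readpage() to fill it
 *	no_cached_page:		allocate a page, add it to the page cache,
 *				then fall through to readpage
 *
 * Callers normally reach this via generic_file_read_iter() below.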
1900 */ 1901 static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, 1902 struct iov_iter *iter, ssize_t written) 1903 { 1904 struct address_space *mapping = filp->f_mapping; 1905 struct inode *inode = mapping->host; 1906 struct file_ra_state *ra = &filp->f_ra; 1907 pgoff_t index; 1908 pgoff_t last_index; 1909 pgoff_t prev_index; 1910 unsigned long offset; /* offset into pagecache page */ 1911 unsigned int prev_offset; 1912 int error = 0; 1913 1914 if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) 1915 return 0; 1916 iov_iter_truncate(iter, inode->i_sb->s_maxbytes); 1917 1918 index = *ppos >> PAGE_SHIFT; 1919 prev_index = ra->prev_pos >> PAGE_SHIFT; 1920 prev_offset = ra->prev_pos & (PAGE_SIZE-1); 1921 last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; 1922 offset = *ppos & ~PAGE_MASK; 1923 1924 for (;;) { 1925 struct page *page; 1926 pgoff_t end_index; 1927 loff_t isize; 1928 unsigned long nr, ret; 1929 1930 cond_resched(); 1931 find_page: 1932 if (fatal_signal_pending(current)) { 1933 error = -EINTR; 1934 goto out; 1935 } 1936 1937 page = find_get_page(mapping, index); 1938 if (!page) { 1939 page_cache_sync_readahead(mapping, 1940 ra, filp, 1941 index, last_index - index); 1942 page = find_get_page(mapping, index); 1943 if (unlikely(page == NULL)) 1944 goto no_cached_page; 1945 } 1946 if (PageReadahead(page)) { 1947 page_cache_async_readahead(mapping, 1948 ra, filp, page, 1949 index, last_index - index); 1950 } 1951 if (!PageUptodate(page)) { 1952 /* 1953 * See comment in do_read_cache_page on why 1954 * wait_on_page_locked is used to avoid unnecessarily 1955 * serialisations and why it's safe. 1956 */ 1957 error = wait_on_page_locked_killable(page); 1958 if (unlikely(error)) 1959 goto readpage_error; 1960 if (PageUptodate(page)) 1961 goto page_ok; 1962 1963 if (inode->i_blkbits == PAGE_SHIFT || 1964 !mapping->a_ops->is_partially_uptodate) 1965 goto page_not_up_to_date; 1966 /* pipes can't handle partially uptodate pages */ 1967 if (unlikely(iter->type & ITER_PIPE)) 1968 goto page_not_up_to_date; 1969 if (!trylock_page(page)) 1970 goto page_not_up_to_date; 1971 /* Did it get truncated before we got the lock? */ 1972 if (!page->mapping) 1973 goto page_not_up_to_date_locked; 1974 if (!mapping->a_ops->is_partially_uptodate(page, 1975 offset, iter->count)) 1976 goto page_not_up_to_date_locked; 1977 unlock_page(page); 1978 } 1979 page_ok: 1980 /* 1981 * i_size must be checked after we know the page is Uptodate. 1982 * 1983 * Checking i_size after the check allows us to calculate 1984 * the correct value for "nr", which means the zero-filled 1985 * part of the page is not copied back to userspace (unless 1986 * another truncate extends the file - this is desired though). 1987 */ 1988 1989 isize = i_size_read(inode); 1990 end_index = (isize - 1) >> PAGE_SHIFT; 1991 if (unlikely(!isize || index > end_index)) { 1992 put_page(page); 1993 goto out; 1994 } 1995 1996 /* nr is the maximum number of bytes to copy from this page */ 1997 nr = PAGE_SIZE; 1998 if (index == end_index) { 1999 nr = ((isize - 1) & ~PAGE_MASK) + 1; 2000 if (nr <= offset) { 2001 put_page(page); 2002 goto out; 2003 } 2004 } 2005 nr = nr - offset; 2006 2007 /* If users can be writing to this page using arbitrary 2008 * virtual addresses, take care about potential aliasing 2009 * before reading the page on the kernel side. 
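 * (flush_dcache_page() below is a no-op on architectures whose data caches
 * are not virtually aliased, so this costs nothing where the aliasing
 * problem cannot occur)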
2010 */ 2011 if (mapping_writably_mapped(mapping)) 2012 flush_dcache_page(page); 2013 2014 /* 2015 * When a sequential read accesses a page several times, 2016 * only mark it as accessed the first time. 2017 */ 2018 if (prev_index != index || offset != prev_offset) 2019 mark_page_accessed(page); 2020 prev_index = index; 2021 2022 /* 2023 * Ok, we have the page, and it's up-to-date, so 2024 * now we can copy it to user space... 2025 */ 2026 2027 ret = copy_page_to_iter(page, offset, nr, iter); 2028 offset += ret; 2029 index += offset >> PAGE_SHIFT; 2030 offset &= ~PAGE_MASK; 2031 prev_offset = offset; 2032 2033 put_page(page); 2034 written += ret; 2035 if (!iov_iter_count(iter)) 2036 goto out; 2037 if (ret < nr) { 2038 error = -EFAULT; 2039 goto out; 2040 } 2041 continue; 2042 2043 page_not_up_to_date: 2044 /* Get exclusive access to the page ... */ 2045 error = lock_page_killable(page); 2046 if (unlikely(error)) 2047 goto readpage_error; 2048 2049 page_not_up_to_date_locked: 2050 /* Did it get truncated before we got the lock? */ 2051 if (!page->mapping) { 2052 unlock_page(page); 2053 put_page(page); 2054 continue; 2055 } 2056 2057 /* Did somebody else fill it already? */ 2058 if (PageUptodate(page)) { 2059 unlock_page(page); 2060 goto page_ok; 2061 } 2062 2063 readpage: 2064 /* 2065 * A previous I/O error may have been due to temporary 2066 * failures, eg. multipath errors. 2067 * PG_error will be set again if readpage fails. 2068 */ 2069 ClearPageError(page); 2070 /* Start the actual read. The read will unlock the page. */ 2071 error = mapping->a_ops->readpage(filp, page); 2072 2073 if (unlikely(error)) { 2074 if (error == AOP_TRUNCATED_PAGE) { 2075 put_page(page); 2076 error = 0; 2077 goto find_page; 2078 } 2079 goto readpage_error; 2080 } 2081 2082 if (!PageUptodate(page)) { 2083 error = lock_page_killable(page); 2084 if (unlikely(error)) 2085 goto readpage_error; 2086 if (!PageUptodate(page)) { 2087 if (page->mapping == NULL) { 2088 /* 2089 * invalidate_mapping_pages got it 2090 */ 2091 unlock_page(page); 2092 put_page(page); 2093 goto find_page; 2094 } 2095 unlock_page(page); 2096 shrink_readahead_size_eio(filp, ra); 2097 error = -EIO; 2098 goto readpage_error; 2099 } 2100 unlock_page(page); 2101 } 2102 2103 goto page_ok; 2104 2105 readpage_error: 2106 /* UHHUH! A synchronous read error occurred. Report it */ 2107 put_page(page); 2108 goto out; 2109 2110 no_cached_page: 2111 /* 2112 * Ok, it wasn't cached, so we need to create a new 2113 * page.. 2114 */ 2115 page = page_cache_alloc_cold(mapping); 2116 if (!page) { 2117 error = -ENOMEM; 2118 goto out; 2119 } 2120 error = add_to_page_cache_lru(page, mapping, index, 2121 mapping_gfp_constraint(mapping, GFP_KERNEL)); 2122 if (error) { 2123 put_page(page); 2124 if (error == -EEXIST) { 2125 error = 0; 2126 goto find_page; 2127 } 2128 goto out; 2129 } 2130 goto readpage; 2131 } 2132 2133 out: 2134 ra->prev_pos = prev_index; 2135 ra->prev_pos <<= PAGE_SHIFT; 2136 ra->prev_pos |= prev_offset; 2137 2138 *ppos = ((loff_t)index << PAGE_SHIFT) + offset; 2139 file_accessed(filp); 2140 return written ? written : error; 2141 } 2142 2143 /** 2144 * generic_file_read_iter - generic filesystem read routine 2145 * @iocb: kernel I/O control block 2146 * @iter: destination for the data read 2147 * 2148 * This is the "read_iter()" routine for all filesystems 2149 * that can use the page cache directly. 
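 *
 * A filesystem that keeps its file data in the page cache typically just
 * points its file_operations at the generic helpers; a sketch (the foo_
 * name is hypothetical):
 *
 *	const struct file_operations foo_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.write_iter	= generic_file_write_iter,
 *		.mmap		= generic_file_mmap,
 *	};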
2150 */ 2151 ssize_t 2152 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) 2153 { 2154 struct file *file = iocb->ki_filp; 2155 ssize_t retval = 0; 2156 size_t count = iov_iter_count(iter); 2157 2158 if (!count) 2159 goto out; /* skip atime */ 2160 2161 if (iocb->ki_flags & IOCB_DIRECT) { 2162 struct address_space *mapping = file->f_mapping; 2163 struct inode *inode = mapping->host; 2164 loff_t size; 2165 2166 size = i_size_read(inode); 2167 if (iocb->ki_flags & IOCB_NOWAIT) { 2168 if (filemap_range_has_page(mapping, iocb->ki_pos, 2169 iocb->ki_pos + count - 1)) 2170 return -EAGAIN; 2171 } else { 2172 retval = filemap_write_and_wait_range(mapping, 2173 iocb->ki_pos, 2174 iocb->ki_pos + count - 1); 2175 if (retval < 0) 2176 goto out; 2177 } 2178 2179 file_accessed(file); 2180 2181 retval = mapping->a_ops->direct_IO(iocb, iter); 2182 if (retval >= 0) { 2183 iocb->ki_pos += retval; 2184 count -= retval; 2185 } 2186 iov_iter_revert(iter, count - iov_iter_count(iter)); 2187 2188 /* 2189 * Btrfs can have a short DIO read if we encounter 2190 * compressed extents, so if there was an error, or if 2191 * we've already read everything we wanted to, or if 2192 * there was a short read because we hit EOF, go ahead 2193 * and return. Otherwise fallthrough to buffered io for 2194 * the rest of the read. Buffered reads will not work for 2195 * DAX files, so don't bother trying. 2196 */ 2197 if (retval < 0 || !count || iocb->ki_pos >= size || 2198 IS_DAX(inode)) 2199 goto out; 2200 } 2201 2202 retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval); 2203 out: 2204 return retval; 2205 } 2206 EXPORT_SYMBOL(generic_file_read_iter); 2207 2208 #ifdef CONFIG_MMU 2209 /** 2210 * page_cache_read - adds requested page to the page cache if not already there 2211 * @file: file to read 2212 * @offset: page index 2213 * @gfp_mask: memory allocation flags 2214 * 2215 * This adds the requested page to the page cache if it isn't already there, 2216 * and schedules an I/O to read in its contents from disk. 2217 */ 2218 static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) 2219 { 2220 struct address_space *mapping = file->f_mapping; 2221 struct page *page; 2222 int ret; 2223 2224 do { 2225 page = __page_cache_alloc(gfp_mask|__GFP_COLD); 2226 if (!page) 2227 return -ENOMEM; 2228 2229 ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL); 2230 if (ret == 0) 2231 ret = mapping->a_ops->readpage(file, page); 2232 else if (ret == -EEXIST) 2233 ret = 0; /* losing race to add is OK */ 2234 2235 put_page(page); 2236 2237 } while (ret == AOP_TRUNCATED_PAGE); 2238 2239 return ret; 2240 } 2241 2242 #define MMAP_LOTSAMISS (100) 2243 2244 /* 2245 * Synchronous readahead happens when we don't even find 2246 * a page in the page cache at all. 2247 */ 2248 static void do_sync_mmap_readahead(struct vm_area_struct *vma, 2249 struct file_ra_state *ra, 2250 struct file *file, 2251 pgoff_t offset) 2252 { 2253 struct address_space *mapping = file->f_mapping; 2254 2255 /* If we don't want any read-ahead, don't bother */ 2256 if (vma->vm_flags & VM_RAND_READ) 2257 return; 2258 if (!ra->ra_pages) 2259 return; 2260 2261 if (vma->vm_flags & VM_SEQ_READ) { 2262 page_cache_sync_readahead(mapping, ra, file, offset, 2263 ra->ra_pages); 2264 return; 2265 } 2266 2267 /* Avoid banging the cache line if not needed */ 2268 if (ra->mmap_miss < MMAP_LOTSAMISS * 10) 2269 ra->mmap_miss++; 2270 2271 /* 2272 * Do we miss much more than hit in this file? 
If so, 2273 * stop bothering with read-ahead. It will only hurt. 2274 */ 2275 if (ra->mmap_miss > MMAP_LOTSAMISS) 2276 return; 2277 2278 /* 2279 * mmap read-around 2280 */ 2281 ra->start = max_t(long, 0, offset - ra->ra_pages / 2); 2282 ra->size = ra->ra_pages; 2283 ra->async_size = ra->ra_pages / 4; 2284 ra_submit(ra, mapping, file); 2285 } 2286 2287 /* 2288 * Asynchronous readahead happens when we find the page and PG_readahead, 2289 * so we want to possibly extend the readahead further.. 2290 */ 2291 static void do_async_mmap_readahead(struct vm_area_struct *vma, 2292 struct file_ra_state *ra, 2293 struct file *file, 2294 struct page *page, 2295 pgoff_t offset) 2296 { 2297 struct address_space *mapping = file->f_mapping; 2298 2299 /* If we don't want any read-ahead, don't bother */ 2300 if (vma->vm_flags & VM_RAND_READ) 2301 return; 2302 if (ra->mmap_miss > 0) 2303 ra->mmap_miss--; 2304 if (PageReadahead(page)) 2305 page_cache_async_readahead(mapping, ra, file, 2306 page, offset, ra->ra_pages); 2307 } 2308 2309 /** 2310 * filemap_fault - read in file data for page fault handling 2311 * @vmf: struct vm_fault containing details of the fault 2312 * 2313 * filemap_fault() is invoked via the vma operations vector for a 2314 * mapped memory region to read in file data during a page fault. 2315 * 2316 * The goto's are kind of ugly, but this streamlines the normal case of having 2317 * it in the page cache, and handles the special cases reasonably without 2318 * having a lot of duplicated code. 2319 * 2320 * vma->vm_mm->mmap_sem must be held on entry. 2321 * 2322 * If our return value has VM_FAULT_RETRY set, it's because 2323 * lock_page_or_retry() returned 0. 2324 * The mmap_sem has usually been released in this case. 2325 * See __lock_page_or_retry() for the exception. 2326 * 2327 * If our return value does not have VM_FAULT_RETRY set, the mmap_sem 2328 * has not been released. 2329 * 2330 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. 2331 */ 2332 int filemap_fault(struct vm_fault *vmf) 2333 { 2334 int error; 2335 struct file *file = vmf->vma->vm_file; 2336 struct address_space *mapping = file->f_mapping; 2337 struct file_ra_state *ra = &file->f_ra; 2338 struct inode *inode = mapping->host; 2339 pgoff_t offset = vmf->pgoff; 2340 pgoff_t max_off; 2341 struct page *page; 2342 int ret = 0; 2343 2344 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2345 if (unlikely(offset >= max_off)) 2346 return VM_FAULT_SIGBUS; 2347 2348 /* 2349 * Do we have something in the page cache already? 2350 */ 2351 page = find_get_page(mapping, offset); 2352 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { 2353 /* 2354 * We found the page, so try async readahead before 2355 * waiting for the lock. 2356 */ 2357 do_async_mmap_readahead(vmf->vma, ra, file, page, offset); 2358 } else if (!page) { 2359 /* No page in the page cache at all */ 2360 do_sync_mmap_readahead(vmf->vma, ra, file, offset); 2361 count_vm_event(PGMAJFAULT); 2362 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); 2363 ret = VM_FAULT_MAJOR; 2364 retry_find: 2365 page = find_get_page(mapping, offset); 2366 if (!page) 2367 goto no_cached_page; 2368 } 2369 2370 if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { 2371 put_page(page); 2372 return ret | VM_FAULT_RETRY; 2373 } 2374 2375 /* Did it get truncated? 
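 * (truncation clears page->mapping while holding the page lock, so
 * re-checking the mapping here, after lock_page_or_retry(), is what makes
 * the earlier lockless find_get_page() safe)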
*/ 2376 if (unlikely(page->mapping != mapping)) { 2377 unlock_page(page); 2378 put_page(page); 2379 goto retry_find; 2380 } 2381 VM_BUG_ON_PAGE(page->index != offset, page); 2382 2383 /* 2384 * We have a locked page in the page cache, now we need to check 2385 * that it's up-to-date. If not, it is going to be due to an error. 2386 */ 2387 if (unlikely(!PageUptodate(page))) 2388 goto page_not_uptodate; 2389 2390 /* 2391 * Found the page and have a reference on it. 2392 * We must recheck i_size under page lock. 2393 */ 2394 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2395 if (unlikely(offset >= max_off)) { 2396 unlock_page(page); 2397 put_page(page); 2398 return VM_FAULT_SIGBUS; 2399 } 2400 2401 vmf->page = page; 2402 return ret | VM_FAULT_LOCKED; 2403 2404 no_cached_page: 2405 /* 2406 * We're only likely to ever get here if MADV_RANDOM is in 2407 * effect. 2408 */ 2409 error = page_cache_read(file, offset, vmf->gfp_mask); 2410 2411 /* 2412 * The page we want has now been added to the page cache. 2413 * In the unlikely event that someone removed it in the 2414 * meantime, we'll just come back here and read it again. 2415 */ 2416 if (error >= 0) 2417 goto retry_find; 2418 2419 /* 2420 * An error return from page_cache_read can result if the 2421 * system is low on memory, or a problem occurs while trying 2422 * to schedule I/O. 2423 */ 2424 if (error == -ENOMEM) 2425 return VM_FAULT_OOM; 2426 return VM_FAULT_SIGBUS; 2427 2428 page_not_uptodate: 2429 /* 2430 * Umm, take care of errors if the page isn't up-to-date. 2431 * Try to re-read it _once_. We do this synchronously, 2432 * because there really aren't any performance issues here 2433 * and we need to check for errors. 2434 */ 2435 ClearPageError(page); 2436 error = mapping->a_ops->readpage(file, page); 2437 if (!error) { 2438 wait_on_page_locked(page); 2439 if (!PageUptodate(page)) 2440 error = -EIO; 2441 } 2442 put_page(page); 2443 2444 if (!error || error == AOP_TRUNCATED_PAGE) 2445 goto retry_find; 2446 2447 /* Things didn't work out. Return zero to tell the mm layer so. */ 2448 shrink_readahead_size_eio(file, ra); 2449 return VM_FAULT_SIGBUS; 2450 } 2451 EXPORT_SYMBOL(filemap_fault); 2452 2453 void filemap_map_pages(struct vm_fault *vmf, 2454 pgoff_t start_pgoff, pgoff_t end_pgoff) 2455 { 2456 struct radix_tree_iter iter; 2457 void **slot; 2458 struct file *file = vmf->vma->vm_file; 2459 struct address_space *mapping = file->f_mapping; 2460 pgoff_t last_pgoff = start_pgoff; 2461 unsigned long max_idx; 2462 struct page *head, *page; 2463 2464 rcu_read_lock(); 2465 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, 2466 start_pgoff) { 2467 if (iter.index > end_pgoff) 2468 break; 2469 repeat: 2470 page = radix_tree_deref_slot(slot); 2471 if (unlikely(!page)) 2472 goto next; 2473 if (radix_tree_exception(page)) { 2474 if (radix_tree_deref_retry(page)) { 2475 slot = radix_tree_iter_retry(&iter); 2476 continue; 2477 } 2478 goto next; 2479 } 2480 2481 head = compound_head(page); 2482 if (!page_cache_get_speculative(head)) 2483 goto repeat; 2484 2485 /* The page was split under us? */ 2486 if (compound_head(page) != head) { 2487 put_page(head); 2488 goto repeat; 2489 } 2490 2491 /* Has the page moved? 
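 * (i.e. was this slot reused for a different page while we were taking the
 * reference? if so, drop the reference and look at the slot again)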
*/ 2492 if (unlikely(page != *slot)) { 2493 put_page(head); 2494 goto repeat; 2495 } 2496 2497 if (!PageUptodate(page) || 2498 PageReadahead(page) || 2499 PageHWPoison(page)) 2500 goto skip; 2501 if (!trylock_page(page)) 2502 goto skip; 2503 2504 if (page->mapping != mapping || !PageUptodate(page)) 2505 goto unlock; 2506 2507 max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 2508 if (page->index >= max_idx) 2509 goto unlock; 2510 2511 if (file->f_ra.mmap_miss > 0) 2512 file->f_ra.mmap_miss--; 2513 2514 vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; 2515 if (vmf->pte) 2516 vmf->pte += iter.index - last_pgoff; 2517 last_pgoff = iter.index; 2518 if (alloc_set_pte(vmf, NULL, page)) 2519 goto unlock; 2520 unlock_page(page); 2521 goto next; 2522 unlock: 2523 unlock_page(page); 2524 skip: 2525 put_page(page); 2526 next: 2527 /* Huge page is mapped? No need to proceed. */ 2528 if (pmd_trans_huge(*vmf->pmd)) 2529 break; 2530 if (iter.index == end_pgoff) 2531 break; 2532 } 2533 rcu_read_unlock(); 2534 } 2535 EXPORT_SYMBOL(filemap_map_pages); 2536 2537 int filemap_page_mkwrite(struct vm_fault *vmf) 2538 { 2539 struct page *page = vmf->page; 2540 struct inode *inode = file_inode(vmf->vma->vm_file); 2541 int ret = VM_FAULT_LOCKED; 2542 2543 sb_start_pagefault(inode->i_sb); 2544 file_update_time(vmf->vma->vm_file); 2545 lock_page(page); 2546 if (page->mapping != inode->i_mapping) { 2547 unlock_page(page); 2548 ret = VM_FAULT_NOPAGE; 2549 goto out; 2550 } 2551 /* 2552 * We mark the page dirty already here so that when freeze is in 2553 * progress, we are guaranteed that writeback during freezing will 2554 * see the dirty page and writeprotect it again. 2555 */ 2556 set_page_dirty(page); 2557 wait_for_stable_page(page); 2558 out: 2559 sb_end_pagefault(inode->i_sb); 2560 return ret; 2561 } 2562 EXPORT_SYMBOL(filemap_page_mkwrite); 2563 2564 const struct vm_operations_struct generic_file_vm_ops = { 2565 .fault = filemap_fault, 2566 .map_pages = filemap_map_pages, 2567 .page_mkwrite = filemap_page_mkwrite, 2568 }; 2569 2570 /* This is used for a general mmap of a disk file */ 2571 2572 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 2573 { 2574 struct address_space *mapping = file->f_mapping; 2575 2576 if (!mapping->a_ops->readpage) 2577 return -ENOEXEC; 2578 file_accessed(file); 2579 vma->vm_ops = &generic_file_vm_ops; 2580 return 0; 2581 } 2582 2583 /* 2584 * This is for filesystems which do not implement ->writepage. 
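 * Without ->writepage there is no way to write dirty pages back, so shared
 * writable (VM_SHARED && VM_MAYWRITE) mappings are refused with -EINVAL
 * below; read-only and private mappings are still allowed.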
2585 */ 2586 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) 2587 { 2588 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 2589 return -EINVAL; 2590 return generic_file_mmap(file, vma); 2591 } 2592 #else 2593 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 2594 { 2595 return -ENOSYS; 2596 } 2597 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) 2598 { 2599 return -ENOSYS; 2600 } 2601 #endif /* CONFIG_MMU */ 2602 2603 EXPORT_SYMBOL(generic_file_mmap); 2604 EXPORT_SYMBOL(generic_file_readonly_mmap); 2605 2606 static struct page *wait_on_page_read(struct page *page) 2607 { 2608 if (!IS_ERR(page)) { 2609 wait_on_page_locked(page); 2610 if (!PageUptodate(page)) { 2611 put_page(page); 2612 page = ERR_PTR(-EIO); 2613 } 2614 } 2615 return page; 2616 } 2617 2618 static struct page *do_read_cache_page(struct address_space *mapping, 2619 pgoff_t index, 2620 int (*filler)(void *, struct page *), 2621 void *data, 2622 gfp_t gfp) 2623 { 2624 struct page *page; 2625 int err; 2626 repeat: 2627 page = find_get_page(mapping, index); 2628 if (!page) { 2629 page = __page_cache_alloc(gfp | __GFP_COLD); 2630 if (!page) 2631 return ERR_PTR(-ENOMEM); 2632 err = add_to_page_cache_lru(page, mapping, index, gfp); 2633 if (unlikely(err)) { 2634 put_page(page); 2635 if (err == -EEXIST) 2636 goto repeat; 2637 /* Presumably ENOMEM for radix tree node */ 2638 return ERR_PTR(err); 2639 } 2640 2641 filler: 2642 err = filler(data, page); 2643 if (err < 0) { 2644 put_page(page); 2645 return ERR_PTR(err); 2646 } 2647 2648 page = wait_on_page_read(page); 2649 if (IS_ERR(page)) 2650 return page; 2651 goto out; 2652 } 2653 if (PageUptodate(page)) 2654 goto out; 2655 2656 /* 2657 * Page is not up to date and may be locked due one of the following 2658 * case a: Page is being filled and the page lock is held 2659 * case b: Read/write error clearing the page uptodate status 2660 * case c: Truncation in progress (page locked) 2661 * case d: Reclaim in progress 2662 * 2663 * Case a, the page will be up to date when the page is unlocked. 2664 * There is no need to serialise on the page lock here as the page 2665 * is pinned so the lock gives no additional protection. Even if the 2666 * the page is truncated, the data is still valid if PageUptodate as 2667 * it's a race vs truncate race. 2668 * Case b, the page will not be up to date 2669 * Case c, the page may be truncated but in itself, the data may still 2670 * be valid after IO completes as it's a read vs truncate race. The 2671 * operation must restart if the page is not uptodate on unlock but 2672 * otherwise serialising on page lock to stabilise the mapping gives 2673 * no additional guarantees to the caller as the page lock is 2674 * released before return. 2675 * Case d, similar to truncation. If reclaim holds the page lock, it 2676 * will be a race with remove_mapping that determines if the mapping 2677 * is valid on unlock but otherwise the data is valid and there is 2678 * no need to serialise with page lock. 2679 * 2680 * As the page lock gives no additional guarantee, we optimistically 2681 * wait on the page to be unlocked and check if it's up to date and 2682 * use the page if it is. Otherwise, the page lock is required to 2683 * distinguish between the different cases. The motivation is that we 2684 * avoid spurious serialisations and wakeups when multiple processes 2685 * wait on the same page for IO to complete. 
2686 */ 2687 wait_on_page_locked(page); 2688 if (PageUptodate(page)) 2689 goto out; 2690 2691 /* Distinguish between all the cases under the safety of the lock */ 2692 lock_page(page); 2693 2694 /* Case c or d, restart the operation */ 2695 if (!page->mapping) { 2696 unlock_page(page); 2697 put_page(page); 2698 goto repeat; 2699 } 2700 2701 /* Someone else locked and filled the page in a very small window */ 2702 if (PageUptodate(page)) { 2703 unlock_page(page); 2704 goto out; 2705 } 2706 goto filler; 2707 2708 out: 2709 mark_page_accessed(page); 2710 return page; 2711 } 2712 2713 /** 2714 * read_cache_page - read into page cache, fill it if needed 2715 * @mapping: the page's address_space 2716 * @index: the page index 2717 * @filler: function to perform the read 2718 * @data: first arg to filler(data, page) function, often left as NULL 2719 * 2720 * Read into the page cache. If a page already exists, and PageUptodate() is 2721 * not set, try to fill the page and wait for it to become unlocked. 2722 * 2723 * If the page does not get brought uptodate, return -EIO. 2724 */ 2725 struct page *read_cache_page(struct address_space *mapping, 2726 pgoff_t index, 2727 int (*filler)(void *, struct page *), 2728 void *data) 2729 { 2730 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); 2731 } 2732 EXPORT_SYMBOL(read_cache_page); 2733 2734 /** 2735 * read_cache_page_gfp - read into page cache, using specified page allocation flags. 2736 * @mapping: the page's address_space 2737 * @index: the page index 2738 * @gfp: the page allocator flags to use if allocating 2739 * 2740 * This is the same as "read_mapping_page(mapping, index, NULL)", but with 2741 * any new page allocations done using the specified allocation flags. 2742 * 2743 * If the page does not get brought uptodate, return -EIO. 2744 */ 2745 struct page *read_cache_page_gfp(struct address_space *mapping, 2746 pgoff_t index, 2747 gfp_t gfp) 2748 { 2749 filler_t *filler = (filler_t *)mapping->a_ops->readpage; 2750 2751 return do_read_cache_page(mapping, index, filler, NULL, gfp); 2752 } 2753 EXPORT_SYMBOL(read_cache_page_gfp); 2754 2755 /* 2756 * Performs necessary checks before doing a write 2757 * 2758 * Can adjust writing position or amount of bytes to write. 2759 * Returns appropriate error code that caller should return or 2760 * zero in case that write should be allowed. 2761 */ 2762 inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) 2763 { 2764 struct file *file = iocb->ki_filp; 2765 struct inode *inode = file->f_mapping->host; 2766 unsigned long limit = rlimit(RLIMIT_FSIZE); 2767 loff_t pos; 2768 2769 if (!iov_iter_count(from)) 2770 return 0; 2771 2772 /* FIXME: this is for backwards compatibility with 2.4 */ 2773 if (iocb->ki_flags & IOCB_APPEND) 2774 iocb->ki_pos = i_size_read(inode); 2775 2776 pos = iocb->ki_pos; 2777 2778 if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) 2779 return -EINVAL; 2780 2781 if (limit != RLIM_INFINITY) { 2782 if (iocb->ki_pos >= limit) { 2783 send_sig(SIGXFSZ, current, 0); 2784 return -EFBIG; 2785 } 2786 iov_iter_truncate(from, limit - (unsigned long)pos); 2787 } 2788 2789 /* 2790 * LFS rule 2791 */ 2792 if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && 2793 !(file->f_flags & O_LARGEFILE))) { 2794 if (pos >= MAX_NON_LFS) 2795 return -EFBIG; 2796 iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); 2797 } 2798 2799 /* 2800 * Are we about to exceed the fs block limit ? 
2801 * 2802 * If we have written data it becomes a short write. If we have 2803 * exceeded without writing data we send a signal and return EFBIG. 2804 * Linus frestrict idea will clean these up nicely.. 2805 */ 2806 if (unlikely(pos >= inode->i_sb->s_maxbytes)) 2807 return -EFBIG; 2808 2809 iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); 2810 return iov_iter_count(from); 2811 } 2812 EXPORT_SYMBOL(generic_write_checks); 2813 2814 int pagecache_write_begin(struct file *file, struct address_space *mapping, 2815 loff_t pos, unsigned len, unsigned flags, 2816 struct page **pagep, void **fsdata) 2817 { 2818 const struct address_space_operations *aops = mapping->a_ops; 2819 2820 return aops->write_begin(file, mapping, pos, len, flags, 2821 pagep, fsdata); 2822 } 2823 EXPORT_SYMBOL(pagecache_write_begin); 2824 2825 int pagecache_write_end(struct file *file, struct address_space *mapping, 2826 loff_t pos, unsigned len, unsigned copied, 2827 struct page *page, void *fsdata) 2828 { 2829 const struct address_space_operations *aops = mapping->a_ops; 2830 2831 return aops->write_end(file, mapping, pos, len, copied, page, fsdata); 2832 } 2833 EXPORT_SYMBOL(pagecache_write_end); 2834 2835 ssize_t 2836 generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) 2837 { 2838 struct file *file = iocb->ki_filp; 2839 struct address_space *mapping = file->f_mapping; 2840 struct inode *inode = mapping->host; 2841 loff_t pos = iocb->ki_pos; 2842 ssize_t written; 2843 size_t write_len; 2844 pgoff_t end; 2845 2846 write_len = iov_iter_count(from); 2847 end = (pos + write_len - 1) >> PAGE_SHIFT; 2848 2849 if (iocb->ki_flags & IOCB_NOWAIT) { 2850 /* If there are pages to writeback, return */ 2851 if (filemap_range_has_page(inode->i_mapping, pos, 2852 pos + iov_iter_count(from))) 2853 return -EAGAIN; 2854 } else { 2855 written = filemap_write_and_wait_range(mapping, pos, 2856 pos + write_len - 1); 2857 if (written) 2858 goto out; 2859 } 2860 2861 /* 2862 * After a write we want buffered reads to be sure to go to disk to get 2863 * the new data. We invalidate clean cached page from the region we're 2864 * about to write. We do this *before* the write so that we can return 2865 * without clobbering -EIOCBQUEUED from ->direct_IO(). 2866 */ 2867 written = invalidate_inode_pages2_range(mapping, 2868 pos >> PAGE_SHIFT, end); 2869 /* 2870 * If a page can not be invalidated, return 0 to fall back 2871 * to buffered write. 2872 */ 2873 if (written) { 2874 if (written == -EBUSY) 2875 return 0; 2876 goto out; 2877 } 2878 2879 written = mapping->a_ops->direct_IO(iocb, from); 2880 2881 /* 2882 * Finally, try again to invalidate clean pages which might have been 2883 * cached by non-direct readahead, or faulted in by get_user_pages() 2884 * if the source of the write was an mmap'ed region of the file 2885 * we're writing. Either one is a pretty crazy thing to do, 2886 * so we don't support it 100%. If this invalidation 2887 * fails, tough, the write still worked... 2888 */ 2889 invalidate_inode_pages2_range(mapping, 2890 pos >> PAGE_SHIFT, end); 2891 2892 if (written > 0) { 2893 pos += written; 2894 write_len -= written; 2895 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2896 i_size_write(inode, pos); 2897 mark_inode_dirty(inode); 2898 } 2899 iocb->ki_pos = pos; 2900 } 2901 iov_iter_revert(from, write_len - iov_iter_count(from)); 2902 out: 2903 return written; 2904 } 2905 EXPORT_SYMBOL(generic_file_direct_write); 2906 2907 /* 2908 * Find or create a page at the given pagecache position. 
Return the locked 2909 * page. This function is specifically for buffered writes. 2910 */ 2911 struct page *grab_cache_page_write_begin(struct address_space *mapping, 2912 pgoff_t index, unsigned flags) 2913 { 2914 struct page *page; 2915 int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT; 2916 2917 if (flags & AOP_FLAG_NOFS) 2918 fgp_flags |= FGP_NOFS; 2919 2920 page = pagecache_get_page(mapping, index, fgp_flags, 2921 mapping_gfp_mask(mapping)); 2922 if (page) 2923 wait_for_stable_page(page); 2924 2925 return page; 2926 } 2927 EXPORT_SYMBOL(grab_cache_page_write_begin); 2928 2929 ssize_t generic_perform_write(struct file *file, 2930 struct iov_iter *i, loff_t pos) 2931 { 2932 struct address_space *mapping = file->f_mapping; 2933 const struct address_space_operations *a_ops = mapping->a_ops; 2934 long status = 0; 2935 ssize_t written = 0; 2936 unsigned int flags = 0; 2937 2938 do { 2939 struct page *page; 2940 unsigned long offset; /* Offset into pagecache page */ 2941 unsigned long bytes; /* Bytes to write to page */ 2942 size_t copied; /* Bytes copied from user */ 2943 void *fsdata; 2944 2945 offset = (pos & (PAGE_SIZE - 1)); 2946 bytes = min_t(unsigned long, PAGE_SIZE - offset, 2947 iov_iter_count(i)); 2948 2949 again: 2950 /* 2951 * Bring in the user page that we will copy from _first_. 2952 * Otherwise there's a nasty deadlock on copying from the 2953 * same page as we're writing to, without it being marked 2954 * up-to-date. 2955 * 2956 * Not only is this an optimisation, but it is also required 2957 * to check that the address is actually valid, when atomic 2958 * usercopies are used, below. 2959 */ 2960 if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 2961 status = -EFAULT; 2962 break; 2963 } 2964 2965 if (fatal_signal_pending(current)) { 2966 status = -EINTR; 2967 break; 2968 } 2969 2970 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2971 &page, &fsdata); 2972 if (unlikely(status < 0)) 2973 break; 2974 2975 if (mapping_writably_mapped(mapping)) 2976 flush_dcache_page(page); 2977 2978 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2979 flush_dcache_page(page); 2980 2981 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2982 page, fsdata); 2983 if (unlikely(status < 0)) 2984 break; 2985 copied = status; 2986 2987 cond_resched(); 2988 2989 iov_iter_advance(i, copied); 2990 if (unlikely(copied == 0)) { 2991 /* 2992 * If we were unable to copy any data at all, we must 2993 * fall back to a single segment length write. 2994 * 2995 * If we didn't fallback here, we could livelock 2996 * because not all segments in the iov can be copied at 2997 * once without a pagefault. 2998 */ 2999 bytes = min_t(unsigned long, PAGE_SIZE - offset, 3000 iov_iter_single_seg_count(i)); 3001 goto again; 3002 } 3003 pos += copied; 3004 written += copied; 3005 3006 balance_dirty_pages_ratelimited(mapping); 3007 } while (iov_iter_count(i)); 3008 3009 return written ? written : status; 3010 } 3011 EXPORT_SYMBOL(generic_perform_write); 3012 3013 /** 3014 * __generic_file_write_iter - write data to a file 3015 * @iocb: IO state structure (file, offset, etc.) 3016 * @from: iov_iter with data to write 3017 * 3018 * This function does all the work needed for actually writing data to a 3019 * file. It does all basic checks, removes SUID from the file, updates 3020 * modification times and calls proper subroutines depending on whether we 3021 * do direct IO or a standard buffered write. 
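 * (for IOCB_DIRECT it may end up doing both: a direct write that stops
 * short of the requested length falls back to generic_perform_write() for
 * the rest, after which the buffered part is written back and invalidated
 * to preserve O_DIRECT semantics)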
3022 * 3023 * It expects i_mutex to be grabbed unless we work on a block device or similar 3024 * object which does not need locking at all. 3025 * 3026 * This function does *not* take care of syncing data in case of O_SYNC write. 3027 * A caller has to handle it. This is mainly due to the fact that we want to 3028 * avoid syncing under i_mutex. 3029 */ 3030 ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 3031 { 3032 struct file *file = iocb->ki_filp; 3033 struct address_space * mapping = file->f_mapping; 3034 struct inode *inode = mapping->host; 3035 ssize_t written = 0; 3036 ssize_t err; 3037 ssize_t status; 3038 3039 /* We can write back this queue in page reclaim */ 3040 current->backing_dev_info = inode_to_bdi(inode); 3041 err = file_remove_privs(file); 3042 if (err) 3043 goto out; 3044 3045 err = file_update_time(file); 3046 if (err) 3047 goto out; 3048 3049 if (iocb->ki_flags & IOCB_DIRECT) { 3050 loff_t pos, endbyte; 3051 3052 written = generic_file_direct_write(iocb, from); 3053 /* 3054 * If the write stopped short of completing, fall back to 3055 * buffered writes. Some filesystems do this for writes to 3056 * holes, for example. For DAX files, a buffered write will 3057 * not succeed (even if it did, DAX does not handle dirty 3058 * page-cache pages correctly). 3059 */ 3060 if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) 3061 goto out; 3062 3063 status = generic_perform_write(file, from, pos = iocb->ki_pos); 3064 /* 3065 * If generic_perform_write() returned a synchronous error 3066 * then we want to return the number of bytes which were 3067 * direct-written, or the error code if that was zero. Note 3068 * that this differs from normal direct-io semantics, which 3069 * will return -EFOO even if some bytes were written. 3070 */ 3071 if (unlikely(status < 0)) { 3072 err = status; 3073 goto out; 3074 } 3075 /* 3076 * We need to ensure that the page cache pages are written to 3077 * disk and invalidated to preserve the expected O_DIRECT 3078 * semantics. 3079 */ 3080 endbyte = pos + status - 1; 3081 err = filemap_write_and_wait_range(mapping, pos, endbyte); 3082 if (err == 0) { 3083 iocb->ki_pos = endbyte + 1; 3084 written += status; 3085 invalidate_mapping_pages(mapping, 3086 pos >> PAGE_SHIFT, 3087 endbyte >> PAGE_SHIFT); 3088 } else { 3089 /* 3090 * We don't know how much we wrote, so just return 3091 * the number of bytes which were direct-written 3092 */ 3093 } 3094 } else { 3095 written = generic_perform_write(file, from, iocb->ki_pos); 3096 if (likely(written > 0)) 3097 iocb->ki_pos += written; 3098 } 3099 out: 3100 current->backing_dev_info = NULL; 3101 return written ? written : err; 3102 } 3103 EXPORT_SYMBOL(__generic_file_write_iter); 3104 3105 /** 3106 * generic_file_write_iter - write data to a file 3107 * @iocb: IO state structure 3108 * @from: iov_iter with data to write 3109 * 3110 * This is a wrapper around __generic_file_write_iter() to be used by most 3111 * filesystems. It takes care of syncing the file in case of O_SYNC file 3112 * and acquires i_mutex as needed. 
3113 */ 3114 ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 3115 { 3116 struct file *file = iocb->ki_filp; 3117 struct inode *inode = file->f_mapping->host; 3118 ssize_t ret; 3119 3120 inode_lock(inode); 3121 ret = generic_write_checks(iocb, from); 3122 if (ret > 0) 3123 ret = __generic_file_write_iter(iocb, from); 3124 inode_unlock(inode); 3125 3126 if (ret > 0) 3127 ret = generic_write_sync(iocb, ret); 3128 return ret; 3129 } 3130 EXPORT_SYMBOL(generic_file_write_iter); 3131 3132 /** 3133 * try_to_release_page() - release old fs-specific metadata on a page 3134 * 3135 * @page: the page which the kernel is trying to free 3136 * @gfp_mask: memory allocation flags (and I/O mode) 3137 * 3138 * The address_space is to try to release any data against the page 3139 * (presumably at page->private). If the release was successful, return '1'. 3140 * Otherwise return zero. 3141 * 3142 * This may also be called if PG_fscache is set on a page, indicating that the 3143 * page is known to the local caching routines. 3144 * 3145 * The @gfp_mask argument specifies whether I/O may be performed to release 3146 * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). 3147 * 3148 */ 3149 int try_to_release_page(struct page *page, gfp_t gfp_mask) 3150 { 3151 struct address_space * const mapping = page->mapping; 3152 3153 BUG_ON(!PageLocked(page)); 3154 if (PageWriteback(page)) 3155 return 0; 3156 3157 if (mapping && mapping->a_ops->releasepage) 3158 return mapping->a_ops->releasepage(page, gfp_mask); 3159 return try_to_free_buffers(page); 3160 } 3161 3162 EXPORT_SYMBOL(try_to_release_page); 3163