/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/aio.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/mm_inline.h> /* for page_is_file_cache() */
#include "internal.h"

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_lock		(truncate_pagecache)
 *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->mapping->tree_lock
 *
 *  ->i_mutex
 *    ->i_mmap_lock		(truncate->unmap_mapping_range)
 *
 *  ->mmap_sem
 *    ->i_mmap_lock
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_sem
 *    ->lock_page		(access_process_vm)
 *
 *  ->i_mutex			(generic_file_buffered_write)
 *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
 *
 *  ->i_mutex
 *    ->i_alloc_sem		(various)
 *
 *  ->inode_lock
 *    ->sb_lock			(fs/fs-writeback.c)
 *    ->mapping->tree_lock	(__sync_single_inode)
 *
 *  ->i_mmap_lock
 *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->tree_lock		(try_to_unmap_one)
 *    ->zone.lru_lock		(follow_page->mark_page_accessed)
 *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
 *    ->private_lock		(page_remove_rmap->set_page_dirty)
 *    ->tree_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
 *
 *  (code doesn't rely on that order, so you could switch it around)
 *  ->tasklist_lock		(memory_failure, collect_procs_ao)
 *    ->i_mmap_lock
 */

/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the mapping's tree_lock.
 */
void __remove_from_page_cache(struct page *page)
{
	struct address_space *mapping = page->mapping;

	radix_tree_delete(&mapping->page_tree, page->index);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	if (PageSwapBacked(page))
		__dec_zone_page_state(page, NR_SHMEM);
	BUG_ON(page_mapped(page));

	/*
	 * Some filesystems seem to re-dirty the page even after
	 * the VM has canceled the dirty bit (eg ext3 journaling).
	 *
	 * Fix it up by doing a final dirty accounting check after
	 * having removed the page entirely.
	 */
	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
		dec_zone_page_state(page, NR_FILE_DIRTY);
		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
	}
}

void remove_from_page_cache(struct page *page)
{
	struct address_space *mapping = page->mapping;
	void (*freepage)(struct page *);

	BUG_ON(!PageLocked(page));

	freepage = mapping->a_ops->freepage;
	spin_lock_irq(&mapping->tree_lock);
	__remove_from_page_cache(page);
	spin_unlock_irq(&mapping->tree_lock);
	mem_cgroup_uncharge_cache_page(page);

	if (freepage)
		freepage(page);
}
EXPORT_SYMBOL(remove_from_page_cache);

static int sync_page(void *word)
{
	struct address_space *mapping;
	struct page *page;

	page = container_of((unsigned long *)word, struct page, flags);

	/*
	 * page_mapping() is being called without PG_locked held.
	 * Some knowledge of the state and use of the page is used to
	 * reduce the requirements down to a memory barrier.
	 * The danger here is of a stale page_mapping() return value
	 * indicating a struct address_space different from the one it's
	 * associated with when it is associated with one.
	 * After smp_mb(), it's either the correct page_mapping() for
	 * the page, or an old page_mapping() and the page's own
	 * page_mapping() has gone NULL.
	 * The ->sync_page() address_space operation must tolerate
	 * page_mapping() going NULL. By an amazing coincidence,
	 * this comes about because none of the users of the page
	 * in the ->sync_page() methods make essential use of the
	 * page_mapping(), merely passing the page down to the backing
	 * device's unplug functions when it's non-NULL, which in turn
	 * ignore it for all cases but swap, where only page_private(page) is
	 * of interest. When page_mapping() does go NULL, the entire
	 * call stack gracefully ignores the page and returns.
	 * -- wli
	 */
	smp_mb();
	mapping = page_mapping(page);
	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
		mapping->a_ops->sync_page(page);
	io_schedule();
	return 0;
}

static int sync_page_killable(void *word)
{
	sync_page(word);
	return fatal_signal_pending(current) ? -EINTR : 0;
}

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping: address space structure to write
 * @start: offset in bytes where the range starts
 * @end: offset in bytes where the range ends (inclusive)
 * @sync_mode: enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	if (!mapping_cap_writeback_dirty(mapping))
		return 0;

	ret = do_writepages(mapping, &wbc);
	return ret;
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping: target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping: address space structure to wait for
 * @start_byte: offset in bytes where the range starts
 * @end_byte: offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
	struct pagevec pvec;
	int nr_pages;
	int ret = 0;

	if (end_byte < start_byte)
		return 0;

	pagevec_init(&pvec, 0);
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;

			wait_on_page_writeback(page);
			if (TestClearPageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/* Check for outstanding write errors */
	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;

	return ret;
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait - wait for all under-writeback pages to complete
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.
 */
int filemap_fdatawait(struct address_space *mapping)
{
	loff_t i_size = i_size_read(mapping->host);

	if (i_size == 0)
		return 0;

	return filemap_fdatawait_range(mapping, 0, i_size - 1);
}
EXPORT_SYMBOL(filemap_fdatawait);

int filemap_write_and_wait(struct address_space *mapping)
{
	int err = 0;

	if (mapping->nrpages) {
		err = filemap_fdatawrite(mapping);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO) {
			int err2 = filemap_fdatawait(mapping);
			if (!err)
				err = err2;
		}
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping: the address_space for the pages
 * @lstart: offset in bytes where the range starts
 * @lend: offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that `lend' is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0;

	if (mapping->nrpages) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO) {
			int err2 = filemap_fdatawait_range(mapping,
						lstart, lend);
			if (!err)
				err = err2;
		}
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
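
/*
 * Usage sketch (illustrative only, not part of the original source): a
 * typical caller is a filesystem's fsync method or a direct-I/O setup path
 * that must flush and wait on a byte range before touching the backing
 * store.  "inode", "start" and "end" below are assumed context:
 *
 *	int err;
 *
 *	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
 *	if (err)
 *		return err;
 */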

/**
 * add_to_page_cache_locked - add a locked page to the pagecache
 * @page: page to add
 * @mapping: the page's address_space
 * @offset: page index
 * @gfp_mask: page allocation mode
 *
 * This function is used to add a page to the pagecache.  The page must be
 * locked.  This function does not add the page to the LRU.  The caller must
 * do that.
 */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
		pgoff_t offset, gfp_t gfp_mask)
{
	int error;

	VM_BUG_ON(!PageLocked(page));

	error = mem_cgroup_cache_charge(page, current->mm,
					gfp_mask & GFP_RECLAIM_MASK);
	if (error)
		goto out;

	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (error == 0) {
		page_cache_get(page);
		page->mapping = mapping;
		page->index = offset;

		spin_lock_irq(&mapping->tree_lock);
		error = radix_tree_insert(&mapping->page_tree, offset, page);
		if (likely(!error)) {
			mapping->nrpages++;
			__inc_zone_page_state(page, NR_FILE_PAGES);
			if (PageSwapBacked(page))
				__inc_zone_page_state(page, NR_SHMEM);
			spin_unlock_irq(&mapping->tree_lock);
		} else {
			page->mapping = NULL;
			spin_unlock_irq(&mapping->tree_lock);
			mem_cgroup_uncharge_cache_page(page);
			page_cache_release(page);
		}
		radix_tree_preload_end();
	} else
		mem_cgroup_uncharge_cache_page(page);
out:
	return error;
}
EXPORT_SYMBOL(add_to_page_cache_locked);

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
				pgoff_t offset, gfp_t gfp_mask)
{
	int ret;

	/*
	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
	 * before shmem_readpage has a chance to mark them as SwapBacked: they
	 * need to go on the anon lru below, and mem_cgroup_cache_charge
	 * (called in add_to_page_cache) needs to know where they're going too.
	 */
	if (mapping_cap_swap_backed(mapping))
		SetPageSwapBacked(page);

	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
	if (ret == 0) {
		if (page_is_file_cache(page))
			lru_cache_add_file(page);
		else
			lru_cache_add_anon(page);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);

#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
	int n;
	struct page *page;

	if (cpuset_do_page_mem_spread()) {
		get_mems_allowed();
		n = cpuset_mem_spread_node();
		page = alloc_pages_exact_node(n, gfp, 0);
		put_mems_allowed();
		return page;
	}
	return alloc_pages(gfp, 0);
}
EXPORT_SYMBOL(__page_cache_alloc);
#endif

static int __sleep_on_page_lock(void *word)
{
	io_schedule();
	return 0;
}

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
static wait_queue_head_t *page_waitqueue(struct page *page)
{
	const struct zone *zone = page_zone(page);

	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
}

static inline void wake_up_page(struct page *page, int bit)
{
	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
}

void wait_on_page_bit(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	if (test_bit(bit_nr, &page->flags))
		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);

/**
 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
 * @page: Page defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @page.
 */
void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
{
	wait_queue_head_t *q = page_waitqueue(page);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, waiter);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * The mb is necessary to enforce ordering between the clear_bit and the read
 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
 */
void unlock_page(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));
	clear_bit_unlock(PG_locked, &page->flags);
	smp_mb__after_clear_bit();
	wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
	if (TestClearPageReclaim(page))
		rotate_reclaimable_page(page);

	if (!test_clear_page_writeback(page))
		BUG();

	smp_mb__after_clear_bit();
	wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);
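
/*
 * Usage sketch (illustrative only, not part of the original source): an
 * asynchronous write-completion path typically records any error and then
 * clears the writeback bit, which wakes waiters via wake_up_page() above.
 * The "error" flag and the surrounding end_io context are assumptions:
 *
 *	if (error)
 *		SetPageError(page);
 *	end_page_writeback(page);
 *
 * Readers that must not see the page under writeback simply call
 * wait_on_page_writeback(page), which sleeps in wait_on_page_bit().
 */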

/**
 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 * @page: the page to lock
 *
 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 * chances are that on the second loop, the block layer's plug list is empty,
 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 */
void __lock_page(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);

int __lock_page_killable(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

	return __wait_on_bit_lock(page_waitqueue(page), &wait,
					sync_page_killable, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);

/**
 * __lock_page_nosync - get a lock on the page, without calling sync_page()
 * @page: the page to lock
 *
 * Variant of lock_page that does not require the caller to hold a reference
 * on the page's mapping.
 */
void __lock_page_nosync(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
							TASK_UNINTERRUPTIBLE);
}

int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
			 unsigned int flags)
{
	if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
		__lock_page(page);
		return 1;
	} else {
		up_read(&mm->mmap_sem);
		wait_on_page_locked(page);
		return 0;
	}
}

/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Is there a pagecache struct page at the given (mapping, offset) tuple?
 * If yes, increment its refcount and return it; if no, return NULL.
 */
struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
{
	void **pagep;
	struct page *page;

	rcu_read_lock();
repeat:
	page = NULL;
	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
	if (pagep) {
		page = radix_tree_deref_slot(pagep);
		if (unlikely(!page))
			goto out;
		if (radix_tree_deref_retry(page))
			goto repeat;

		if (!page_cache_get_speculative(page))
			goto repeat;

		/*
		 * Has the page moved?
		 * This is part of the lockless pagecache protocol. See
		 * include/linux/pagemap.h for details.
		 */
		if (unlikely(page != *pagep)) {
			page_cache_release(page);
			goto repeat;
		}
	}
out:
	rcu_read_unlock();

	return page;
}
EXPORT_SYMBOL(find_get_page);

/**
 * find_lock_page - locate, pin and lock a pagecache page
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Locates the desired pagecache page, locks it, increments its reference
 * count and returns its address.
 *
 * Returns NULL if the page was not present. find_lock_page() may sleep.
 */
struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
{
	struct page *page;

repeat:
	page = find_get_page(mapping, offset);
	if (page) {
		lock_page(page);
		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		VM_BUG_ON(page->index != offset);
	}
	return page;
}
EXPORT_SYMBOL(find_lock_page);
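
/*
 * Usage sketch (illustrative only, not part of the original source): a
 * caller that only wants to peek at a cached page takes a reference with
 * find_get_page() and drops it when done; a caller that needs stable page
 * state uses find_lock_page() and pairs it with unlock_page().  "mapping"
 * and "index" are assumed context:
 *
 *	struct page *page;
 *
 *	page = find_lock_page(mapping, index);
 *	if (page) {
 *		... inspect or update the locked page ...
 *		unlock_page(page);
 *		page_cache_release(page);
 *	}
 */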

/**
 * find_or_create_page - locate or add a pagecache page
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Locates a page in the pagecache.  If the page is not present, a new page
 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 * LRU list.  The returned page is locked and has its reference count
 * incremented.
 *
 * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
 * allocation!
 *
 * find_or_create_page() returns the desired page's address, or NULL on
 * memory exhaustion.
 */
struct page *find_or_create_page(struct address_space *mapping,
		pgoff_t index, gfp_t gfp_mask)
{
	struct page *page;
	int err;
repeat:
	page = find_lock_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;
		/*
		 * We want a regular kernel memory (not highmem or DMA etc)
		 * allocation for the radix tree nodes, but we need to honour
		 * the context-specific requirements the caller has asked for.
		 * GFP_RECLAIM_MASK collects those requirements.
		 */
		err = add_to_page_cache_lru(page, mapping, index,
			(gfp_mask & GFP_RECLAIM_MASK));
		if (unlikely(err)) {
			page_cache_release(page);
			page = NULL;
			if (err == -EEXIST)
				goto repeat;
		}
	}
	return page;
}
EXPORT_SYMBOL(find_or_create_page);

/**
 * find_get_pages - gang pagecache lookup
 * @mapping: The address_space to search
 * @start: The starting page index
 * @nr_pages: The maximum number of pages
 * @pages: Where the resulting pages are placed
 *
 * find_get_pages() will search for and return a group of up to
 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 * find_get_pages() takes a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * find_get_pages() returns the number of pages which were found.
 */
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
			    unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;
	unsigned int nr_found;

	rcu_read_lock();
restart:
	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
				(void ***)pages, start, nr_pages);
	ret = 0;
	for (i = 0; i < nr_found; i++) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot((void **)pages[i]);
		if (unlikely(!page))
			continue;
		if (radix_tree_deref_retry(page)) {
			if (ret)
				start = pages[ret-1]->index;
			goto restart;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *((void **)pages[i]))) {
			page_cache_release(page);
			goto repeat;
		}

		pages[ret] = page;
		ret++;
	}
	rcu_read_unlock();
	return ret;
}
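
/*
 * Usage sketch (illustrative only, not part of the original source): gang
 * lookups are normally consumed in a loop that drops the references taken
 * by find_get_pages().  "mapping" is assumed context:
 *
 *	struct page *pages[16];
 *	pgoff_t index = 0;
 *	unsigned i, nr;
 *
 *	while ((nr = find_get_pages(mapping, index, 16, pages))) {
 *		for (i = 0; i < nr; i++) {
 *			index = pages[i]->index + 1;
 *			... use pages[i] ...
 *			page_cache_release(pages[i]);
 *		}
 *	}
 */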

/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping: The address_space to search
 * @index: The starting page index
 * @nr_pages: The maximum number of pages
 * @pages: Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned number of pages are guaranteed to be contiguous.
 *
 * find_get_pages_contig() returns the number of pages which were found.
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
			       unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;
	unsigned int nr_found;

	rcu_read_lock();
restart:
	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
				(void ***)pages, index, nr_pages);
	ret = 0;
	for (i = 0; i < nr_found; i++) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot((void **)pages[i]);
		if (unlikely(!page))
			continue;
		if (radix_tree_deref_retry(page))
			goto restart;

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *((void **)pages[i]))) {
			page_cache_release(page);
			goto repeat;
		}

		/*
		 * must check mapping and index after taking the ref.
		 * otherwise we can get both false positives and false
		 * negatives, which is just confusing to the caller.
		 */
		if (page->mapping == NULL || page->index != index) {
			page_cache_release(page);
			break;
		}

		pages[ret] = page;
		ret++;
		index++;
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(find_get_pages_contig);

/**
 * find_get_pages_tag - find and return pages that match @tag
 * @mapping: the address_space to search
 * @index: the starting page index
 * @tag: the tag index
 * @nr_pages: the maximum number of pages
 * @pages: where the resulting pages are placed
 *
 * Like find_get_pages, except we only return pages which are tagged with
 * @tag.   We update @index to index the next page for the traversal.
 */
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
			int tag, unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;
	unsigned int nr_found;

	rcu_read_lock();
restart:
	nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
				(void ***)pages, *index, nr_pages, tag);
	ret = 0;
	for (i = 0; i < nr_found; i++) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot((void **)pages[i]);
		if (unlikely(!page))
			continue;
		if (radix_tree_deref_retry(page))
			goto restart;

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *((void **)pages[i]))) {
			page_cache_release(page);
			goto repeat;
		}

		pages[ret] = page;
		ret++;
	}
	rcu_read_unlock();

	if (ret)
		*index = pages[ret - 1]->index + 1;

	return ret;
}
EXPORT_SYMBOL(find_get_pages_tag);
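
/*
 * Usage sketch (illustrative only, not part of the original source):
 * writeback-style walkers scan by tag and rely on find_get_pages_tag()
 * advancing @index for them.  "mapping" is assumed context:
 *
 *	struct page *pages[16];
 *	pgoff_t index = 0;
 *	unsigned i, nr;
 *
 *	while ((nr = find_get_pages_tag(mapping, &index,
 *					PAGECACHE_TAG_DIRTY, 16, pages))) {
 *		for (i = 0; i < nr; i++) {
 *			... queue pages[i] for writeback ...
 *			page_cache_release(pages[i]);
 *		}
 *	}
 */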

/**
 * grab_cache_page_nowait - returns locked page at given index in given cache
 * @mapping: target address_space
 * @index: the page index
 *
 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 */
struct page *
grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_get_page(mapping, index);

	if (page) {
		if (trylock_page(page))
			return page;
		page_cache_release(page);
		return NULL;
	}
	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
		page_cache_release(page);
		page = NULL;
	}
	return page;
}
EXPORT_SYMBOL(grab_cache_page_nowait);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file *filp,
					struct file_ra_state *ra)
{
	ra->ra_pages /= 4;
}

/**
 * do_generic_file_read - generic file read routine
 * @filp: the file to read
 * @ppos: current file position
 * @desc: read_descriptor
 * @actor: read method
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static void do_generic_file_read(struct file *filp, loff_t *ppos,
		read_descriptor_t *desc, read_actor_t actor)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;      /* offset into pagecache page */
	unsigned int prev_offset;
	int error;

	index = *ppos >> PAGE_CACHE_SHIFT;
	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		cond_resched();
find_page:
		page = find_get_page(mapping, index);
		if (!page) {
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		if (PageReadahead(page)) {
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page)) {
			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			if (!trylock_page(page))
				goto page_not_up_to_date;
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)
				goto page_not_up_to_date_locked;
			if (!mapping->a_ops->is_partially_uptodate(page,
								desc, offset))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */

		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			page_cache_release(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				page_cache_release(page);
				goto out;
			}
		}
		nr = nr - offset;

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;
		prev_offset = offset;

		page_cache_release(page);
		if (ret == nr && desc->count)
			continue;
		goto out;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		error = lock_page_killable(page);
		if (unlikely(error))
			goto readpage_error;

page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			page_cache_release(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			error = lock_page_killable(page);
			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					page_cache_release(page);
					goto find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(filp, ra);
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}

		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);
		goto out;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc_cold(mapping);
		if (!page) {
			desc->error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping,
						index, GFP_KERNEL);
		if (error) {
			page_cache_release(page);
			if (error == -EEXIST)
				goto find_page;
			desc->error = error;
			goto out;
		}
		goto readpage;
	}

out:
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_CACHE_SHIFT;
	ra->prev_pos |= prev_offset;

	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
	file_accessed(filp);
}

int file_read_actor(read_descriptor_t *desc, struct page *page,
			unsigned long offset, unsigned long size)
{
	char *kaddr;
	unsigned long left, count = desc->count;

	if (size > count)
		size = count;

	/*
	 * Faults on the destination of a read are common, so do it before
	 * taking the kmap.
	 */
	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
		kaddr = kmap_atomic(page, KM_USER0);
		left = __copy_to_user_inatomic(desc->arg.buf,
						kaddr + offset, size);
		kunmap_atomic(kaddr, KM_USER0);
		if (left == 0)
			goto success;
	}

	/* Do it the slow way */
	kaddr = kmap(page);
	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
	kunmap(page);

	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
success:
	desc->count = count - size;
	desc->written += size;
	desc->arg.buf += size;
	return size;
}

/*
 * Performs necessary checks before doing a write
 * @iov: io vector request
 * @nr_segs: number of segments in the iovec
 * @count: number of bytes to write
 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
 *
 * Adjust number of segments and amount of bytes to write (nr_segs should be
 * properly initialized first). Returns appropriate error code that caller
 * should return or zero in case that write should be allowed.
 */
int generic_segment_checks(const struct iovec *iov,
			unsigned long *nr_segs, size_t *count, int access_flags)
{
	unsigned long seg;
	size_t cnt = 0;
	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		cnt += iv->iov_len;
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(access_flags, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		*nr_segs = seg;
		cnt -= iv->iov_len;	/* This segment is no good */
		break;
	}
	*count = cnt;
	return 0;
}
EXPORT_SYMBOL(generic_segment_checks);

/**
 * generic_file_aio_read - generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iov: io vector request
 * @nr_segs: number of segments in the iovec
 * @pos: current file position
 *
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	ssize_t retval;
	unsigned long seg = 0;
	size_t count;
	loff_t *ppos = &iocb->ki_pos;

	count = 0;
	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (retval)
		return retval;

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	if (filp->f_flags & O_DIRECT) {
		loff_t size;
		struct address_space *mapping;
		struct inode *inode;

		mapping = filp->f_mapping;
		inode = mapping->host;
		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		if (pos < size) {
			retval = filemap_write_and_wait_range(mapping, pos,
					pos + iov_length(iov, nr_segs) - 1);
			if (!retval) {
				retval = mapping->a_ops->direct_IO(READ, iocb,
							iov, pos, nr_segs);
			}
			if (retval > 0) {
				*ppos = pos + retval;
				count -= retval;
			}

			/*
			 * Btrfs can have a short DIO read if we encounter
			 * compressed extents, so if there was an error, or if
			 * we've already read everything we wanted to, or if
			 * there was a short read because we hit EOF, go ahead
			 * and return.  Otherwise fallthrough to buffered io for
			 * the rest of the read.
			 */
			if (retval < 0 || !count || *ppos >= size) {
				file_accessed(filp);
				goto out;
			}
		}
	}

	count = retval;
	for (seg = 0; seg < nr_segs; seg++) {
		read_descriptor_t desc;
		loff_t offset = 0;

		/*
		 * If we did a short DIO read we need to skip the section of the
		 * iov that we've already read data into.
		 */
		if (count) {
			if (count > iov[seg].iov_len) {
				count -= iov[seg].iov_len;
				continue;
			}
			offset = count;
			count = 0;
		}

		desc.written = 0;
		desc.arg.buf = iov[seg].iov_base + offset;
		desc.count = iov[seg].iov_len - offset;
		if (desc.count == 0)
			continue;
		desc.error = 0;
		do_generic_file_read(filp, ppos, &desc, file_read_actor);
		retval += desc.written;
		if (desc.error) {
			retval = retval ?: desc.error;
			break;
		}
		if (desc.count > 0)
			break;
	}
out:
	return retval;
}
EXPORT_SYMBOL(generic_file_aio_read);
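
/*
 * Usage sketch (illustrative only, not part of the original source): a
 * simple pagecache-backed filesystem wires the generic read path into its
 * file_operations; the struct name below is hypothetical:
 *
 *	static const struct file_operations example_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read		= do_sync_read,
 *		.aio_read	= generic_file_aio_read,
 *	};
 */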

static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
	     pgoff_t index, unsigned long nr)
{
	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
		return -EINVAL;

	force_page_cache_readahead(mapping, filp, index, nr);
	return 0;
}

SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
{
	ssize_t ret;
	struct file *file;

	ret = -EBADF;
	file = fget(fd);
	if (file) {
		if (file->f_mode & FMODE_READ) {
			struct address_space *mapping = file->f_mapping;
			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
			unsigned long len = end - start + 1;
			ret = do_readahead(mapping, file, start, len);
		}
		fput(file);
	}
	return ret;
}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
{
	return SYSC_readahead((int) fd, offset, (size_t) count);
}
SYSCALL_ALIAS(sys_readahead, SyS_readahead);
#endif

#ifdef CONFIG_MMU
/**
 * page_cache_read - adds requested page to the page cache if not already there
 * @file: file to read
 * @offset: page index
 *
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static int page_cache_read(struct file *file, pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	int ret;

	do {
		page = page_cache_alloc_cold(mapping);
		if (!page)
			return -ENOMEM;

		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
		if (ret == 0)
			ret = mapping->a_ops->readpage(file, page);
		else if (ret == -EEXIST)
			ret = 0; /* losing race to add is OK */

		page_cache_release(page);

	} while (ret == AOP_TRUNCATED_PAGE);

	return ret;
}

#define MMAP_LOTSAMISS  (100)

/*
 * Synchronous readahead happens when we don't even find
 * a page in the page cache at all.
 */
static void do_sync_mmap_readahead(struct vm_area_struct *vma,
				   struct file_ra_state *ra,
				   struct file *file,
				   pgoff_t offset)
{
	unsigned long ra_pages;
	struct address_space *mapping = file->f_mapping;

	/* If we don't want any read-ahead, don't bother */
	if (VM_RandomReadHint(vma))
		return;

	if (VM_SequentialReadHint(vma) ||
			offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
		page_cache_sync_readahead(mapping, ra, file, offset,
					  ra->ra_pages);
		return;
	}

	if (ra->mmap_miss < INT_MAX)
		ra->mmap_miss++;

	/*
	 * Do we miss much more than hit in this file? If so,
	 * stop bothering with read-ahead. It will only hurt.
	 */
	if (ra->mmap_miss > MMAP_LOTSAMISS)
		return;

	/*
	 * mmap read-around
	 */
	ra_pages = max_sane_readahead(ra->ra_pages);
	if (ra_pages) {
		ra->start = max_t(long, 0, offset - ra_pages/2);
		ra->size = ra_pages;
		ra->async_size = 0;
		ra_submit(ra, mapping, file);
	}
}

/*
 * Asynchronous readahead happens when we find the page and PG_readahead,
 * so we want to possibly extend the readahead further..
 */
static void do_async_mmap_readahead(struct vm_area_struct *vma,
				    struct file_ra_state *ra,
				    struct file *file,
				    struct page *page,
				    pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;

	/* If we don't want any read-ahead, don't bother */
	if (VM_RandomReadHint(vma))
		return;
	if (ra->mmap_miss > 0)
		ra->mmap_miss--;
	if (PageReadahead(page))
		page_cache_async_readahead(mapping, ra, file,
					   page, offset, ra->ra_pages);
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vma: vma in which the fault was taken
 * @vmf: struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int error;
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	pgoff_t offset = vmf->pgoff;
	struct page *page;
	pgoff_t size;
	int ret = 0;

	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (offset >= size)
		return VM_FAULT_SIGBUS;

	/*
	 * Do we have something in the page cache already?
	 */
	page = find_get_page(mapping, offset);
	if (likely(page)) {
		/*
		 * We found the page, so try async readahead before
		 * waiting for the lock.
		 */
		do_async_mmap_readahead(vma, ra, file, page, offset);
	} else {
		/* No page in the page cache at all */
		do_sync_mmap_readahead(vma, ra, file, offset);
		count_vm_event(PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
retry_find:
		page = find_get_page(mapping, offset);
		if (!page)
			goto no_cached_page;
	}

	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
		page_cache_release(page);
		return ret | VM_FAULT_RETRY;
	}

	/* Did it get truncated? */
	if (unlikely(page->mapping != mapping)) {
		unlock_page(page);
		put_page(page);
		goto retry_find;
	}
	VM_BUG_ON(page->index != offset);

	/*
	 * We have a locked page in the page cache, now we need to check
	 * that it's up-to-date. If not, it is going to be due to an error.
	 */
	if (unlikely(!PageUptodate(page)))
		goto page_not_uptodate;

	/*
	 * Found the page and have a reference on it.
	 * We must recheck i_size under page lock.
	 */
	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (unlikely(offset >= size)) {
		unlock_page(page);
		page_cache_release(page);
		return VM_FAULT_SIGBUS;
	}

	ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
	vmf->page = page;
	return ret | VM_FAULT_LOCKED;

no_cached_page:
	/*
	 * We're only likely to ever get here if MADV_RANDOM is in
	 * effect.
	 */
	error = page_cache_read(file, offset);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;

page_not_uptodate:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	ClearPageError(page);
	error = mapping->a_ops->readpage(file, page);
	if (!error) {
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			error = -EIO;
	}
	page_cache_release(page);

	if (!error || error == AOP_TRUNCATED_PAGE)
		goto retry_find;

	/* Things didn't work out. Return zero to tell the mm layer so. */
	shrink_readahead_size_eio(file, ra);
	return VM_FAULT_SIGBUS;
}
EXPORT_SYMBOL(filemap_fault);

const struct vm_operations_struct generic_file_vm_ops = {
	.fault = filemap_fault,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &generic_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;
	return 0;
}
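
/*
 * Usage sketch (illustrative only, not part of the original source): a
 * filesystem hooks the fault-based mmap path up through its
 * file_operations; the struct name below is hypothetical, and a
 * filesystem without ->writepage would use the readonly variant defined
 * just below instead of generic_file_mmap():
 *
 *	static const struct file_operations example_ro_file_operations = {
 *		.read		= do_sync_read,
 *		.aio_read	= generic_file_aio_read,
 *		.mmap		= generic_file_readonly_mmap,
 *	};
 */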

/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}
#else
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);

static struct page *__read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)
{
	struct page *page;
	int err;
repeat:
	page = find_get_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp | __GFP_COLD);
		if (!page)
			return ERR_PTR(-ENOMEM);
		err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
		if (unlikely(err)) {
			page_cache_release(page);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for radix tree node */
			return ERR_PTR(err);
		}
		err = filler(data, page);
		if (err < 0) {
			page_cache_release(page);
			page = ERR_PTR(err);
		}
	}
	return page;
}

static struct page *do_read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)

{
	struct page *page;
	int err;

retry:
	page = __read_cache_page(mapping, index, filler, data, gfp);
	if (IS_ERR(page))
		return page;
	if (PageUptodate(page))
		goto out;

	lock_page(page);
	if (!page->mapping) {
		unlock_page(page);
		page_cache_release(page);
		goto retry;
	}
	if (PageUptodate(page)) {
		unlock_page(page);
		goto out;
	}
	err = filler(data, page);
	if (err < 0) {
		page_cache_release(page);
		return ERR_PTR(err);
	}
out:
	mark_page_accessed(page);
	return page;
}

/**
 * read_cache_page_async - read into page cache, fill it if needed
 * @mapping: the page's address_space
 * @index: the page index
 * @filler: function to perform the read
 * @data: destination for read data
 *
 * Same as read_cache_page, but don't wait for page to become unlocked
 * after submitting it to the filler.
 *
 * Read into the page cache. If a page already exists, and PageUptodate() is
 * not set, try to fill the page but don't wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page_async(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page_async);

static struct page *wait_on_page_read(struct page *page)
{
	if (!IS_ERR(page)) {
		wait_on_page_locked(page);
		if (!PageUptodate(page)) {
			page_cache_release(page);
			page = ERR_PTR(-EIO);
		}
	}
	return page;
}
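
/*
 * Usage sketch (illustrative only, not part of the original source):
 * metadata readers typically fetch a page through one of the
 * read_cache_page helpers and check for an error pointer.  "mapping",
 * "n", "filler" and "data" are assumed context; the filler is usually
 * the filesystem's own readpage wrapper:
 *
 *	struct page *page;
 *
 *	page = read_cache_page(mapping, n, filler, data);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	... use the uptodate page ...
 *	page_cache_release(page);
 */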

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping: the page's address_space
 * @index: the page index
 * @gfp: the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags. Note
 * that the Radix tree operations will still use GFP_KERNEL, so you can't
 * expect to do this atomically or anything like that - but you can pass in
 * other page requirements.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	filler_t *filler = (filler_t *)mapping->a_ops->readpage;

	return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
}
EXPORT_SYMBOL(read_cache_page_gfp);

/**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping: the page's address_space
 * @index: the page index
 * @filler: function to perform the read
 * @data: destination for read data
 *
 * Read into the page cache. If a page already exists, and PageUptodate() is
 * not set, try to fill the page then wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
}
EXPORT_SYMBOL(read_cache_page);

/*
 * The logic we want is
 *
 *	if suid or (sgid and xgrp)
 *		remove privs
 */
int should_remove_suid(struct dentry *dentry)
{
	mode_t mode = dentry->d_inode->i_mode;
	int kill = 0;

	/* suid always must be killed */
	if (unlikely(mode & S_ISUID))
		kill = ATTR_KILL_SUID;

	/*
	 * sgid without any exec bits is just a mandatory locking mark; leave
	 * it alone. If some exec bits are set, it's a real sgid; kill it.
	 */
	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
		kill |= ATTR_KILL_SGID;

	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
		return kill;

	return 0;
}
EXPORT_SYMBOL(should_remove_suid);

static int __remove_suid(struct dentry *dentry, int kill)
{
	struct iattr newattrs;

	newattrs.ia_valid = ATTR_FORCE | kill;
	return notify_change(dentry, &newattrs);
}

int file_remove_suid(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	int killsuid = should_remove_suid(dentry);
	int killpriv = security_inode_need_killpriv(dentry);
	int error = 0;

	if (killpriv < 0)
		return killpriv;
	if (killpriv)
		error = security_inode_killpriv(dentry);
	if (!error && killsuid)
		error = __remove_suid(dentry, killsuid);

	return error;
}
EXPORT_SYMBOL(file_remove_suid);

static size_t __iovec_copy_from_user_inatomic(char *vaddr,
			const struct iovec *iov, size_t base, size_t bytes)
{
	size_t copied = 0, left = 0;

	while (bytes) {
		char __user *buf = iov->iov_base + base;
		int copy = min(bytes, iov->iov_len - base);

		base = 0;
		left = __copy_from_user_inatomic(vaddr, buf, copy);
		copied += copy;
		bytes -= copy;
		vaddr += copy;
		iov++;

		if (unlikely(left))
			break;
	}
	return copied - left;
}

/*
 * Copy as much as we can into the page and return the number of bytes which
 * were successfully copied.  If a fault is encountered then return the number of
 * bytes which were copied.
 */
size_t iov_iter_copy_from_user_atomic(struct page *page,
		struct iov_iter *i, unsigned long offset, size_t bytes)
{
	char *kaddr;
	size_t copied;

	BUG_ON(!in_atomic());
	kaddr = kmap_atomic(page, KM_USER0);
	if (likely(i->nr_segs == 1)) {
		int left;
		char __user *buf = i->iov->iov_base + i->iov_offset;
		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
		copied = bytes - left;
	} else {
		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
						i->iov, i->iov_offset, bytes);
	}
	kunmap_atomic(kaddr, KM_USER0);

	return copied;
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
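
/*
 * Usage sketch (illustrative only, not part of the original source): the
 * buffered-write path pairs this helper with iov_iter_fault_in_readable()
 * and the ->write_begin/->write_end address_space operations, retrying if
 * the atomic copy faults.  "mapping", "pos", "offset", "bytes", "flags" and
 * the iov_iter "i" are assumed context, and error handling is elided:
 *
 *	if (unlikely(iov_iter_fault_in_readable(i, bytes)))
 *		return -EFAULT;
 *	status = a_ops->write_begin(file, mapping, pos, bytes, flags,
 *						&page, &fsdata);
 *	copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
 *	status = a_ops->write_end(file, mapping, pos, bytes, copied,
 *						page, fsdata);
 *	iov_iter_advance(i, copied);
 */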
1956 */ 1957 size_t iov_iter_copy_from_user(struct page *page, 1958 struct iov_iter *i, unsigned long offset, size_t bytes) 1959 { 1960 char *kaddr; 1961 size_t copied; 1962 1963 kaddr = kmap(page); 1964 if (likely(i->nr_segs == 1)) { 1965 int left; 1966 char __user *buf = i->iov->iov_base + i->iov_offset; 1967 left = __copy_from_user(kaddr + offset, buf, bytes); 1968 copied = bytes - left; 1969 } else { 1970 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 1971 i->iov, i->iov_offset, bytes); 1972 } 1973 kunmap(page); 1974 return copied; 1975 } 1976 EXPORT_SYMBOL(iov_iter_copy_from_user); 1977 1978 void iov_iter_advance(struct iov_iter *i, size_t bytes) 1979 { 1980 BUG_ON(i->count < bytes); 1981 1982 if (likely(i->nr_segs == 1)) { 1983 i->iov_offset += bytes; 1984 i->count -= bytes; 1985 } else { 1986 const struct iovec *iov = i->iov; 1987 size_t base = i->iov_offset; 1988 1989 /* 1990 * The !iov->iov_len check ensures we skip over unlikely 1991 * zero-length segments (without overruning the iovec). 1992 */ 1993 while (bytes || unlikely(i->count && !iov->iov_len)) { 1994 int copy; 1995 1996 copy = min(bytes, iov->iov_len - base); 1997 BUG_ON(!i->count || i->count < copy); 1998 i->count -= copy; 1999 bytes -= copy; 2000 base += copy; 2001 if (iov->iov_len == base) { 2002 iov++; 2003 base = 0; 2004 } 2005 } 2006 i->iov = iov; 2007 i->iov_offset = base; 2008 } 2009 } 2010 EXPORT_SYMBOL(iov_iter_advance); 2011 2012 /* 2013 * Fault in the first iovec of the given iov_iter, to a maximum length 2014 * of bytes. Returns 0 on success, or non-zero if the memory could not be 2015 * accessed (ie. because it is an invalid address). 2016 * 2017 * writev-intensive code may want this to prefault several iovecs -- that 2018 * would be possible (callers must not rely on the fact that _only_ the 2019 * first iovec will be faulted with the current implementation). 2020 */ 2021 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) 2022 { 2023 char __user *buf = i->iov->iov_base + i->iov_offset; 2024 bytes = min(bytes, i->iov->iov_len - i->iov_offset); 2025 return fault_in_pages_readable(buf, bytes); 2026 } 2027 EXPORT_SYMBOL(iov_iter_fault_in_readable); 2028 2029 /* 2030 * Return the count of just the current iov_iter segment. 2031 */ 2032 size_t iov_iter_single_seg_count(struct iov_iter *i) 2033 { 2034 const struct iovec *iov = i->iov; 2035 if (i->nr_segs == 1) 2036 return i->count; 2037 else 2038 return min(i->count, iov->iov_len - i->iov_offset); 2039 } 2040 EXPORT_SYMBOL(iov_iter_single_seg_count); 2041 2042 /* 2043 * Performs necessary checks before doing a write 2044 * 2045 * Can adjust writing position or amount of bytes to write. 2046 * Returns appropriate error code that caller should return or 2047 * zero in case that write should be allowed. 
2048 */ 2049 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) 2050 { 2051 struct inode *inode = file->f_mapping->host; 2052 unsigned long limit = rlimit(RLIMIT_FSIZE); 2053 2054 if (unlikely(*pos < 0)) 2055 return -EINVAL; 2056 2057 if (!isblk) { 2058 /* FIXME: this is for backwards compatibility with 2.4 */ 2059 if (file->f_flags & O_APPEND) 2060 *pos = i_size_read(inode); 2061 2062 if (limit != RLIM_INFINITY) { 2063 if (*pos >= limit) { 2064 send_sig(SIGXFSZ, current, 0); 2065 return -EFBIG; 2066 } 2067 if (*count > limit - (typeof(limit))*pos) { 2068 *count = limit - (typeof(limit))*pos; 2069 } 2070 } 2071 } 2072 2073 /* 2074 * LFS rule 2075 */ 2076 if (unlikely(*pos + *count > MAX_NON_LFS && 2077 !(file->f_flags & O_LARGEFILE))) { 2078 if (*pos >= MAX_NON_LFS) { 2079 return -EFBIG; 2080 } 2081 if (*count > MAX_NON_LFS - (unsigned long)*pos) { 2082 *count = MAX_NON_LFS - (unsigned long)*pos; 2083 } 2084 } 2085 2086 /* 2087 * Are we about to exceed the fs block limit ? 2088 * 2089 * If we have written data it becomes a short write. If we have 2090 * exceeded without writing data we send a signal and return EFBIG. 2091 * Linus frestrict idea will clean these up nicely.. 2092 */ 2093 if (likely(!isblk)) { 2094 if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { 2095 if (*count || *pos > inode->i_sb->s_maxbytes) { 2096 return -EFBIG; 2097 } 2098 /* zero-length writes at ->s_maxbytes are OK */ 2099 } 2100 2101 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) 2102 *count = inode->i_sb->s_maxbytes - *pos; 2103 } else { 2104 #ifdef CONFIG_BLOCK 2105 loff_t isize; 2106 if (bdev_read_only(I_BDEV(inode))) 2107 return -EPERM; 2108 isize = i_size_read(inode); 2109 if (*pos >= isize) { 2110 if (*count || *pos > isize) 2111 return -ENOSPC; 2112 } 2113 2114 if (*pos + *count > isize) 2115 *count = isize - *pos; 2116 #else 2117 return -EPERM; 2118 #endif 2119 } 2120 return 0; 2121 } 2122 EXPORT_SYMBOL(generic_write_checks); 2123 2124 int pagecache_write_begin(struct file *file, struct address_space *mapping, 2125 loff_t pos, unsigned len, unsigned flags, 2126 struct page **pagep, void **fsdata) 2127 { 2128 const struct address_space_operations *aops = mapping->a_ops; 2129 2130 return aops->write_begin(file, mapping, pos, len, flags, 2131 pagep, fsdata); 2132 } 2133 EXPORT_SYMBOL(pagecache_write_begin); 2134 2135 int pagecache_write_end(struct file *file, struct address_space *mapping, 2136 loff_t pos, unsigned len, unsigned copied, 2137 struct page *page, void *fsdata) 2138 { 2139 const struct address_space_operations *aops = mapping->a_ops; 2140 2141 mark_page_accessed(page); 2142 return aops->write_end(file, mapping, pos, len, copied, page, fsdata); 2143 } 2144 EXPORT_SYMBOL(pagecache_write_end); 2145 2146 ssize_t 2147 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 2148 unsigned long *nr_segs, loff_t pos, loff_t *ppos, 2149 size_t count, size_t ocount) 2150 { 2151 struct file *file = iocb->ki_filp; 2152 struct address_space *mapping = file->f_mapping; 2153 struct inode *inode = mapping->host; 2154 ssize_t written; 2155 size_t write_len; 2156 pgoff_t end; 2157 2158 if (count != ocount) 2159 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2160 2161 write_len = iov_length(iov, *nr_segs); 2162 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; 2163 2164 written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); 2165 if (written) 2166 goto out; 2167 2168 /* 2169 * After a write we want buffered reads to be sure 
to go to disk to get 2170 * the new data. We invalidate clean cached page from the region we're 2171 * about to write. We do this *before* the write so that we can return 2172 * without clobbering -EIOCBQUEUED from ->direct_IO(). 2173 */ 2174 if (mapping->nrpages) { 2175 written = invalidate_inode_pages2_range(mapping, 2176 pos >> PAGE_CACHE_SHIFT, end); 2177 /* 2178 * If a page can not be invalidated, return 0 to fall back 2179 * to buffered write. 2180 */ 2181 if (written) { 2182 if (written == -EBUSY) 2183 return 0; 2184 goto out; 2185 } 2186 } 2187 2188 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2189 2190 /* 2191 * Finally, try again to invalidate clean pages which might have been 2192 * cached by non-direct readahead, or faulted in by get_user_pages() 2193 * if the source of the write was an mmap'ed region of the file 2194 * we're writing. Either one is a pretty crazy thing to do, 2195 * so we don't support it 100%. If this invalidation 2196 * fails, tough, the write still worked... 2197 */ 2198 if (mapping->nrpages) { 2199 invalidate_inode_pages2_range(mapping, 2200 pos >> PAGE_CACHE_SHIFT, end); 2201 } 2202 2203 if (written > 0) { 2204 pos += written; 2205 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2206 i_size_write(inode, pos); 2207 mark_inode_dirty(inode); 2208 } 2209 *ppos = pos; 2210 } 2211 out: 2212 return written; 2213 } 2214 EXPORT_SYMBOL(generic_file_direct_write); 2215 2216 /* 2217 * Find or create a page at the given pagecache position. Return the locked 2218 * page. This function is specifically for buffered writes. 2219 */ 2220 struct page *grab_cache_page_write_begin(struct address_space *mapping, 2221 pgoff_t index, unsigned flags) 2222 { 2223 int status; 2224 struct page *page; 2225 gfp_t gfp_notmask = 0; 2226 if (flags & AOP_FLAG_NOFS) 2227 gfp_notmask = __GFP_FS; 2228 repeat: 2229 page = find_lock_page(mapping, index); 2230 if (page) 2231 return page; 2232 2233 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); 2234 if (!page) 2235 return NULL; 2236 status = add_to_page_cache_lru(page, mapping, index, 2237 GFP_KERNEL & ~gfp_notmask); 2238 if (unlikely(status)) { 2239 page_cache_release(page); 2240 if (status == -EEXIST) 2241 goto repeat; 2242 return NULL; 2243 } 2244 return page; 2245 } 2246 EXPORT_SYMBOL(grab_cache_page_write_begin); 2247 2248 static ssize_t generic_perform_write(struct file *file, 2249 struct iov_iter *i, loff_t pos) 2250 { 2251 struct address_space *mapping = file->f_mapping; 2252 const struct address_space_operations *a_ops = mapping->a_ops; 2253 long status = 0; 2254 ssize_t written = 0; 2255 unsigned int flags = 0; 2256 2257 /* 2258 * Copies from kernel address space cannot fail (NFSD is a big user). 2259 */ 2260 if (segment_eq(get_fs(), KERNEL_DS)) 2261 flags |= AOP_FLAG_UNINTERRUPTIBLE; 2262 2263 do { 2264 struct page *page; 2265 unsigned long offset; /* Offset into pagecache page */ 2266 unsigned long bytes; /* Bytes to write to page */ 2267 size_t copied; /* Bytes copied from user */ 2268 void *fsdata; 2269 2270 offset = (pos & (PAGE_CACHE_SIZE - 1)); 2271 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2272 iov_iter_count(i)); 2273 2274 again: 2275 2276 /* 2277 * Bring in the user page that we will copy from _first_. 2278 * Otherwise there's a nasty deadlock on copying from the 2279 * same page as we're writing to, without it being marked 2280 * up-to-date. 
2281 * 2282 * Not only is this an optimisation, but it is also required 2283 * to check that the address is actually valid, when atomic 2284 * usercopies are used, below. 2285 */ 2286 if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 2287 status = -EFAULT; 2288 break; 2289 } 2290 2291 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2292 &page, &fsdata); 2293 if (unlikely(status)) 2294 break; 2295 2296 if (mapping_writably_mapped(mapping)) 2297 flush_dcache_page(page); 2298 2299 pagefault_disable(); 2300 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2301 pagefault_enable(); 2302 flush_dcache_page(page); 2303 2304 mark_page_accessed(page); 2305 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2306 page, fsdata); 2307 if (unlikely(status < 0)) 2308 break; 2309 copied = status; 2310 2311 cond_resched(); 2312 2313 iov_iter_advance(i, copied); 2314 if (unlikely(copied == 0)) { 2315 /* 2316 * If we were unable to copy any data at all, we must 2317 * fall back to a single segment length write. 2318 * 2319 * If we didn't fallback here, we could livelock 2320 * because not all segments in the iov can be copied at 2321 * once without a pagefault. 2322 */ 2323 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2324 iov_iter_single_seg_count(i)); 2325 goto again; 2326 } 2327 pos += copied; 2328 written += copied; 2329 2330 balance_dirty_pages_ratelimited(mapping); 2331 2332 } while (iov_iter_count(i)); 2333 2334 return written ? written : status; 2335 } 2336 2337 ssize_t 2338 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, 2339 unsigned long nr_segs, loff_t pos, loff_t *ppos, 2340 size_t count, ssize_t written) 2341 { 2342 struct file *file = iocb->ki_filp; 2343 ssize_t status; 2344 struct iov_iter i; 2345 2346 iov_iter_init(&i, iov, nr_segs, count, written); 2347 status = generic_perform_write(file, &i, pos); 2348 2349 if (likely(status >= 0)) { 2350 written += status; 2351 *ppos = pos + status; 2352 } 2353 2354 return written ? written : status; 2355 } 2356 EXPORT_SYMBOL(generic_file_buffered_write); 2357 2358 /** 2359 * __generic_file_aio_write - write data to a file 2360 * @iocb: IO state structure (file, offset, etc.) 2361 * @iov: vector with data to write 2362 * @nr_segs: number of segments in the vector 2363 * @ppos: position where to write 2364 * 2365 * This function does all the work needed for actually writing data to a 2366 * file. It does all basic checks, removes SUID from the file, updates 2367 * modification times and calls proper subroutines depending on whether we 2368 * do direct IO or a standard buffered write. 2369 * 2370 * It expects i_mutex to be grabbed unless we work on a block device or similar 2371 * object which does not need locking at all. 2372 * 2373 * This function does *not* take care of syncing data in case of O_SYNC write. 2374 * A caller has to handle it. This is mainly due to the fact that we want to 2375 * avoid syncing under i_mutex. 
2376 */ 2377 ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2378 unsigned long nr_segs, loff_t *ppos) 2379 { 2380 struct file *file = iocb->ki_filp; 2381 struct address_space * mapping = file->f_mapping; 2382 size_t ocount; /* original count */ 2383 size_t count; /* after file limit checks */ 2384 struct inode *inode = mapping->host; 2385 loff_t pos; 2386 ssize_t written; 2387 ssize_t err; 2388 2389 ocount = 0; 2390 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 2391 if (err) 2392 return err; 2393 2394 count = ocount; 2395 pos = *ppos; 2396 2397 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2398 2399 /* We can write back this queue in page reclaim */ 2400 current->backing_dev_info = mapping->backing_dev_info; 2401 written = 0; 2402 2403 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 2404 if (err) 2405 goto out; 2406 2407 if (count == 0) 2408 goto out; 2409 2410 err = file_remove_suid(file); 2411 if (err) 2412 goto out; 2413 2414 file_update_time(file); 2415 2416 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2417 if (unlikely(file->f_flags & O_DIRECT)) { 2418 loff_t endbyte; 2419 ssize_t written_buffered; 2420 2421 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, 2422 ppos, count, ocount); 2423 if (written < 0 || written == count) 2424 goto out; 2425 /* 2426 * direct-io write to a hole: fall through to buffered I/O 2427 * for completing the rest of the request. 2428 */ 2429 pos += written; 2430 count -= written; 2431 written_buffered = generic_file_buffered_write(iocb, iov, 2432 nr_segs, pos, ppos, count, 2433 written); 2434 /* 2435 * If generic_file_buffered_write() retuned a synchronous error 2436 * then we want to return the number of bytes which were 2437 * direct-written, or the error code if that was zero. Note 2438 * that this differs from normal direct-io semantics, which 2439 * will return -EFOO even if some bytes were written. 2440 */ 2441 if (written_buffered < 0) { 2442 err = written_buffered; 2443 goto out; 2444 } 2445 2446 /* 2447 * We need to ensure that the page cache pages are written to 2448 * disk and invalidated to preserve the expected O_DIRECT 2449 * semantics. 2450 */ 2451 endbyte = pos + written_buffered - written - 1; 2452 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 2453 if (err == 0) { 2454 written = written_buffered; 2455 invalidate_mapping_pages(mapping, 2456 pos >> PAGE_CACHE_SHIFT, 2457 endbyte >> PAGE_CACHE_SHIFT); 2458 } else { 2459 /* 2460 * We don't know how much we wrote, so just return 2461 * the number of bytes which were direct-written 2462 */ 2463 } 2464 } else { 2465 written = generic_file_buffered_write(iocb, iov, nr_segs, 2466 pos, ppos, count, written); 2467 } 2468 out: 2469 current->backing_dev_info = NULL; 2470 return written ? written : err; 2471 } 2472 EXPORT_SYMBOL(__generic_file_aio_write); 2473 2474 /** 2475 * generic_file_aio_write - write data to a file 2476 * @iocb: IO state structure 2477 * @iov: vector with data to write 2478 * @nr_segs: number of segments in the vector 2479 * @pos: position in file where to write 2480 * 2481 * This is a wrapper around __generic_file_aio_write() to be used by most 2482 * filesystems. It takes care of syncing the file in case of O_SYNC file 2483 * and acquires i_mutex as needed. 
2484 */ 2485 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2486 unsigned long nr_segs, loff_t pos) 2487 { 2488 struct file *file = iocb->ki_filp; 2489 struct inode *inode = file->f_mapping->host; 2490 ssize_t ret; 2491 2492 BUG_ON(iocb->ki_pos != pos); 2493 2494 mutex_lock(&inode->i_mutex); 2495 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 2496 mutex_unlock(&inode->i_mutex); 2497 2498 if (ret > 0 || ret == -EIOCBQUEUED) { 2499 ssize_t err; 2500 2501 err = generic_write_sync(file, pos, ret); 2502 if (err < 0 && ret > 0) 2503 ret = err; 2504 } 2505 return ret; 2506 } 2507 EXPORT_SYMBOL(generic_file_aio_write); 2508 2509 /** 2510 * try_to_release_page() - release old fs-specific metadata on a page 2511 * 2512 * @page: the page which the kernel is trying to free 2513 * @gfp_mask: memory allocation flags (and I/O mode) 2514 * 2515 * The address_space is to try to release any data against the page 2516 * (presumably at page->private). If the release was successful, return `1'. 2517 * Otherwise return zero. 2518 * 2519 * This may also be called if PG_fscache is set on a page, indicating that the 2520 * page is known to the local caching routines. 2521 * 2522 * The @gfp_mask argument specifies whether I/O may be performed to release 2523 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). 2524 * 2525 */ 2526 int try_to_release_page(struct page *page, gfp_t gfp_mask) 2527 { 2528 struct address_space * const mapping = page->mapping; 2529 2530 BUG_ON(!PageLocked(page)); 2531 if (PageWriteback(page)) 2532 return 0; 2533 2534 if (mapping && mapping->a_ops->releasepage) 2535 return mapping->a_ops->releasepage(page, gfp_mask); 2536 return try_to_free_buffers(page); 2537 } 2538 2539 EXPORT_SYMBOL(try_to_release_page); 2540