/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>

#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>

#include <linux/uaccess.h>
#include <asm/pgtable.h>

#include "internal.h"

#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_mutex making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
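 *
 * (Illustrative summary of the handshake, as seen in the code below: while
 * a hole is being punched, shmem_fallocate() points i_private at an
 * on-stack shmem_falloc whose waitq makes racing faults wait in
 * shmem_fault(); while an extent is being preallocated, waitq is NULL and
 * shmem_writepage() uses the start/next window to bump nr_unswapped for
 * pages it declines to swap out.)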
100 */ 101 struct shmem_falloc { 102 wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ 103 pgoff_t start; /* start of range currently being fallocated */ 104 pgoff_t next; /* the next page offset to be fallocated */ 105 pgoff_t nr_falloced; /* how many new pages have been fallocated */ 106 pgoff_t nr_unswapped; /* how often writepage refused to swap out */ 107 }; 108 109 #ifdef CONFIG_TMPFS 110 static unsigned long shmem_default_max_blocks(void) 111 { 112 return totalram_pages / 2; 113 } 114 115 static unsigned long shmem_default_max_inodes(void) 116 { 117 return min(totalram_pages - totalhigh_pages, totalram_pages / 2); 118 } 119 #endif 120 121 static bool shmem_should_replace_page(struct page *page, gfp_t gfp); 122 static int shmem_replace_page(struct page **pagep, gfp_t gfp, 123 struct shmem_inode_info *info, pgoff_t index); 124 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 125 struct page **pagep, enum sgp_type sgp, 126 gfp_t gfp, struct vm_area_struct *vma, 127 struct vm_fault *vmf, vm_fault_t *fault_type); 128 129 int shmem_getpage(struct inode *inode, pgoff_t index, 130 struct page **pagep, enum sgp_type sgp) 131 { 132 return shmem_getpage_gfp(inode, index, pagep, sgp, 133 mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); 134 } 135 136 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 137 { 138 return sb->s_fs_info; 139 } 140 141 /* 142 * shmem_file_setup pre-accounts the whole fixed size of a VM object, 143 * for shared memory and for shared anonymous (/dev/zero) mappings 144 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), 145 * consistent with the pre-accounting of private mappings ... 146 */ 147 static inline int shmem_acct_size(unsigned long flags, loff_t size) 148 { 149 return (flags & VM_NORESERVE) ? 150 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); 151 } 152 153 static inline void shmem_unacct_size(unsigned long flags, loff_t size) 154 { 155 if (!(flags & VM_NORESERVE)) 156 vm_unacct_memory(VM_ACCT(size)); 157 } 158 159 static inline int shmem_reacct_size(unsigned long flags, 160 loff_t oldsize, loff_t newsize) 161 { 162 if (!(flags & VM_NORESERVE)) { 163 if (VM_ACCT(newsize) > VM_ACCT(oldsize)) 164 return security_vm_enough_memory_mm(current->mm, 165 VM_ACCT(newsize) - VM_ACCT(oldsize)); 166 else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) 167 vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); 168 } 169 return 0; 170 } 171 172 /* 173 * ... whereas tmpfs objects are accounted incrementally as 174 * pages are allocated, in order to allow large sparse files. 175 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, 176 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 
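 *
 * A rough worked example, with 4K pages: a !VM_NORESERVE object of size
 * 10MB is charged VM_ACCT(10MB) = 2560 units up front by shmem_acct_size(),
 * while a VM_NORESERVE (tmpfs) object of the same size is charged nothing
 * at setup and one unit per page later, in shmem_acct_block(), as pages
 * are actually allocated.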
177 */ 178 static inline int shmem_acct_block(unsigned long flags, long pages) 179 { 180 if (!(flags & VM_NORESERVE)) 181 return 0; 182 183 return security_vm_enough_memory_mm(current->mm, 184 pages * VM_ACCT(PAGE_SIZE)); 185 } 186 187 static inline void shmem_unacct_blocks(unsigned long flags, long pages) 188 { 189 if (flags & VM_NORESERVE) 190 vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); 191 } 192 193 static inline bool shmem_inode_acct_block(struct inode *inode, long pages) 194 { 195 struct shmem_inode_info *info = SHMEM_I(inode); 196 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 197 198 if (shmem_acct_block(info->flags, pages)) 199 return false; 200 201 if (sbinfo->max_blocks) { 202 if (percpu_counter_compare(&sbinfo->used_blocks, 203 sbinfo->max_blocks - pages) > 0) 204 goto unacct; 205 percpu_counter_add(&sbinfo->used_blocks, pages); 206 } 207 208 return true; 209 210 unacct: 211 shmem_unacct_blocks(info->flags, pages); 212 return false; 213 } 214 215 static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) 216 { 217 struct shmem_inode_info *info = SHMEM_I(inode); 218 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 219 220 if (sbinfo->max_blocks) 221 percpu_counter_sub(&sbinfo->used_blocks, pages); 222 shmem_unacct_blocks(info->flags, pages); 223 } 224 225 static const struct super_operations shmem_ops; 226 static const struct address_space_operations shmem_aops; 227 static const struct file_operations shmem_file_operations; 228 static const struct inode_operations shmem_inode_operations; 229 static const struct inode_operations shmem_dir_inode_operations; 230 static const struct inode_operations shmem_special_inode_operations; 231 static const struct vm_operations_struct shmem_vm_ops; 232 static struct file_system_type shmem_fs_type; 233 234 bool vma_is_shmem(struct vm_area_struct *vma) 235 { 236 return vma->vm_ops == &shmem_vm_ops; 237 } 238 239 static LIST_HEAD(shmem_swaplist); 240 static DEFINE_MUTEX(shmem_swaplist_mutex); 241 242 static int shmem_reserve_inode(struct super_block *sb) 243 { 244 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 245 if (sbinfo->max_inodes) { 246 spin_lock(&sbinfo->stat_lock); 247 if (!sbinfo->free_inodes) { 248 spin_unlock(&sbinfo->stat_lock); 249 return -ENOSPC; 250 } 251 sbinfo->free_inodes--; 252 spin_unlock(&sbinfo->stat_lock); 253 } 254 return 0; 255 } 256 257 static void shmem_free_inode(struct super_block *sb) 258 { 259 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 260 if (sbinfo->max_inodes) { 261 spin_lock(&sbinfo->stat_lock); 262 sbinfo->free_inodes++; 263 spin_unlock(&sbinfo->stat_lock); 264 } 265 } 266 267 /** 268 * shmem_recalc_inode - recalculate the block usage of an inode 269 * @inode: inode to recalc 270 * 271 * We have to calculate the free blocks since the mm can drop 272 * undirtied hole pages behind our back. 273 * 274 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped 275 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) 276 * 277 * It has to be called with the spinlock held. 
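 *
 * E.g. (made-up numbers) with alloced == 8, swapped == 2 and nrpages == 5,
 * one undirtied hole page has been dropped behind our back: freed == 1,
 * so alloced drops to 7 and one block is unaccounted.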
278 */ 279 static void shmem_recalc_inode(struct inode *inode) 280 { 281 struct shmem_inode_info *info = SHMEM_I(inode); 282 long freed; 283 284 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 285 if (freed > 0) { 286 info->alloced -= freed; 287 inode->i_blocks -= freed * BLOCKS_PER_PAGE; 288 shmem_inode_unacct_blocks(inode, freed); 289 } 290 } 291 292 bool shmem_charge(struct inode *inode, long pages) 293 { 294 struct shmem_inode_info *info = SHMEM_I(inode); 295 unsigned long flags; 296 297 if (!shmem_inode_acct_block(inode, pages)) 298 return false; 299 300 spin_lock_irqsave(&info->lock, flags); 301 info->alloced += pages; 302 inode->i_blocks += pages * BLOCKS_PER_PAGE; 303 shmem_recalc_inode(inode); 304 spin_unlock_irqrestore(&info->lock, flags); 305 inode->i_mapping->nrpages += pages; 306 307 return true; 308 } 309 310 void shmem_uncharge(struct inode *inode, long pages) 311 { 312 struct shmem_inode_info *info = SHMEM_I(inode); 313 unsigned long flags; 314 315 spin_lock_irqsave(&info->lock, flags); 316 info->alloced -= pages; 317 inode->i_blocks -= pages * BLOCKS_PER_PAGE; 318 shmem_recalc_inode(inode); 319 spin_unlock_irqrestore(&info->lock, flags); 320 321 shmem_inode_unacct_blocks(inode, pages); 322 } 323 324 /* 325 * Replace item expected in xarray by a new item, while holding xa_lock. 326 */ 327 static int shmem_replace_entry(struct address_space *mapping, 328 pgoff_t index, void *expected, void *replacement) 329 { 330 XA_STATE(xas, &mapping->i_pages, index); 331 void *item; 332 333 VM_BUG_ON(!expected); 334 VM_BUG_ON(!replacement); 335 item = xas_load(&xas); 336 if (item != expected) 337 return -ENOENT; 338 xas_store(&xas, replacement); 339 return 0; 340 } 341 342 /* 343 * Sometimes, before we decide whether to proceed or to fail, we must check 344 * that an entry was not already brought back from swap by a racing thread. 345 * 346 * Checking page is not enough: by the time a SwapCache page is locked, it 347 * might be reused, and again be SwapCache, using the same swap as before. 348 */ 349 static bool shmem_confirm_swap(struct address_space *mapping, 350 pgoff_t index, swp_entry_t swap) 351 { 352 return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); 353 } 354 355 /* 356 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option 357 * 358 * SHMEM_HUGE_NEVER: 359 * disables huge pages for the mount; 360 * SHMEM_HUGE_ALWAYS: 361 * enables huge pages for the mount; 362 * SHMEM_HUGE_WITHIN_SIZE: 363 * only allocate huge pages if the page will be fully within i_size, 364 * also respect fadvise()/madvise() hints; 365 * SHMEM_HUGE_ADVISE: 366 * only allocate huge pages if requested with fadvise()/madvise(); 367 */ 368 369 #define SHMEM_HUGE_NEVER 0 370 #define SHMEM_HUGE_ALWAYS 1 371 #define SHMEM_HUGE_WITHIN_SIZE 2 372 #define SHMEM_HUGE_ADVISE 3 373 374 /* 375 * Special values. 
376 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: 377 * 378 * SHMEM_HUGE_DENY: 379 * disables huge on shm_mnt and all mounts, for emergency use; 380 * SHMEM_HUGE_FORCE: 381 * enables huge on shm_mnt and all mounts, w/o needing option, for testing; 382 * 383 */ 384 #define SHMEM_HUGE_DENY (-1) 385 #define SHMEM_HUGE_FORCE (-2) 386 387 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 388 /* ifdef here to avoid bloating shmem.o when not necessary */ 389 390 static int shmem_huge __read_mostly; 391 392 #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) 393 static int shmem_parse_huge(const char *str) 394 { 395 if (!strcmp(str, "never")) 396 return SHMEM_HUGE_NEVER; 397 if (!strcmp(str, "always")) 398 return SHMEM_HUGE_ALWAYS; 399 if (!strcmp(str, "within_size")) 400 return SHMEM_HUGE_WITHIN_SIZE; 401 if (!strcmp(str, "advise")) 402 return SHMEM_HUGE_ADVISE; 403 if (!strcmp(str, "deny")) 404 return SHMEM_HUGE_DENY; 405 if (!strcmp(str, "force")) 406 return SHMEM_HUGE_FORCE; 407 return -EINVAL; 408 } 409 410 static const char *shmem_format_huge(int huge) 411 { 412 switch (huge) { 413 case SHMEM_HUGE_NEVER: 414 return "never"; 415 case SHMEM_HUGE_ALWAYS: 416 return "always"; 417 case SHMEM_HUGE_WITHIN_SIZE: 418 return "within_size"; 419 case SHMEM_HUGE_ADVISE: 420 return "advise"; 421 case SHMEM_HUGE_DENY: 422 return "deny"; 423 case SHMEM_HUGE_FORCE: 424 return "force"; 425 default: 426 VM_BUG_ON(1); 427 return "bad_val"; 428 } 429 } 430 #endif 431 432 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, 433 struct shrink_control *sc, unsigned long nr_to_split) 434 { 435 LIST_HEAD(list), *pos, *next; 436 LIST_HEAD(to_remove); 437 struct inode *inode; 438 struct shmem_inode_info *info; 439 struct page *page; 440 unsigned long batch = sc ? sc->nr_to_scan : 128; 441 int removed = 0, split = 0; 442 443 if (list_empty(&sbinfo->shrinklist)) 444 return SHRINK_STOP; 445 446 spin_lock(&sbinfo->shrinklist_lock); 447 list_for_each_safe(pos, next, &sbinfo->shrinklist) { 448 info = list_entry(pos, struct shmem_inode_info, shrinklist); 449 450 /* pin the inode */ 451 inode = igrab(&info->vfs_inode); 452 453 /* inode is about to be evicted */ 454 if (!inode) { 455 list_del_init(&info->shrinklist); 456 removed++; 457 goto next; 458 } 459 460 /* Check if there's anything to gain */ 461 if (round_up(inode->i_size, PAGE_SIZE) == 462 round_up(inode->i_size, HPAGE_PMD_SIZE)) { 463 list_move(&info->shrinklist, &to_remove); 464 removed++; 465 goto next; 466 } 467 468 list_move(&info->shrinklist, &list); 469 next: 470 if (!--batch) 471 break; 472 } 473 spin_unlock(&sbinfo->shrinklist_lock); 474 475 list_for_each_safe(pos, next, &to_remove) { 476 info = list_entry(pos, struct shmem_inode_info, shrinklist); 477 inode = &info->vfs_inode; 478 list_del_init(&info->shrinklist); 479 iput(inode); 480 } 481 482 list_for_each_safe(pos, next, &list) { 483 int ret; 484 485 info = list_entry(pos, struct shmem_inode_info, shrinklist); 486 inode = &info->vfs_inode; 487 488 if (nr_to_split && split >= nr_to_split) 489 goto leave; 490 491 page = find_get_page(inode->i_mapping, 492 (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); 493 if (!page) 494 goto drop; 495 496 /* No huge page at the end of the file: nothing to split */ 497 if (!PageTransHuge(page)) { 498 put_page(page); 499 goto drop; 500 } 501 502 /* 503 * Leave the inode on the list if we failed to lock 504 * the page at this time. 505 * 506 * Waiting for the lock may lead to deadlock in the 507 * reclaim path. 
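		 * (The lock holder may itself be blocked allocating memory,
		 * i.e. possibly waiting on this very reclaim pass, so a
		 * sleeping lock_page() here could deadlock; hence the trylock.)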
508 */ 509 if (!trylock_page(page)) { 510 put_page(page); 511 goto leave; 512 } 513 514 ret = split_huge_page(page); 515 unlock_page(page); 516 put_page(page); 517 518 /* If split failed leave the inode on the list */ 519 if (ret) 520 goto leave; 521 522 split++; 523 drop: 524 list_del_init(&info->shrinklist); 525 removed++; 526 leave: 527 iput(inode); 528 } 529 530 spin_lock(&sbinfo->shrinklist_lock); 531 list_splice_tail(&list, &sbinfo->shrinklist); 532 sbinfo->shrinklist_len -= removed; 533 spin_unlock(&sbinfo->shrinklist_lock); 534 535 return split; 536 } 537 538 static long shmem_unused_huge_scan(struct super_block *sb, 539 struct shrink_control *sc) 540 { 541 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 542 543 if (!READ_ONCE(sbinfo->shrinklist_len)) 544 return SHRINK_STOP; 545 546 return shmem_unused_huge_shrink(sbinfo, sc, 0); 547 } 548 549 static long shmem_unused_huge_count(struct super_block *sb, 550 struct shrink_control *sc) 551 { 552 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 553 return READ_ONCE(sbinfo->shrinklist_len); 554 } 555 #else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */ 556 557 #define shmem_huge SHMEM_HUGE_DENY 558 559 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, 560 struct shrink_control *sc, unsigned long nr_to_split) 561 { 562 return 0; 563 } 564 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ 565 566 static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) 567 { 568 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && 569 (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && 570 shmem_huge != SHMEM_HUGE_DENY) 571 return true; 572 return false; 573 } 574 575 /* 576 * Like add_to_page_cache_locked, but error if expected item has gone. 577 */ 578 static int shmem_add_to_page_cache(struct page *page, 579 struct address_space *mapping, 580 pgoff_t index, void *expected) 581 { 582 int error, nr = hpage_nr_pages(page); 583 584 VM_BUG_ON_PAGE(PageTail(page), page); 585 VM_BUG_ON_PAGE(index != round_down(index, nr), page); 586 VM_BUG_ON_PAGE(!PageLocked(page), page); 587 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 588 VM_BUG_ON(expected && PageTransHuge(page)); 589 590 page_ref_add(page, nr); 591 page->mapping = mapping; 592 page->index = index; 593 594 xa_lock_irq(&mapping->i_pages); 595 if (PageTransHuge(page)) { 596 void __rcu **results; 597 pgoff_t idx; 598 int i; 599 600 error = 0; 601 if (radix_tree_gang_lookup_slot(&mapping->i_pages, 602 &results, &idx, index, 1) && 603 idx < index + HPAGE_PMD_NR) { 604 error = -EEXIST; 605 } 606 607 if (!error) { 608 for (i = 0; i < HPAGE_PMD_NR; i++) { 609 error = radix_tree_insert(&mapping->i_pages, 610 index + i, page + i); 611 VM_BUG_ON(error); 612 } 613 count_vm_event(THP_FILE_ALLOC); 614 } 615 } else if (!expected) { 616 error = radix_tree_insert(&mapping->i_pages, index, page); 617 } else { 618 error = shmem_replace_entry(mapping, index, expected, page); 619 } 620 621 if (!error) { 622 mapping->nrpages += nr; 623 if (PageTransHuge(page)) 624 __inc_node_page_state(page, NR_SHMEM_THPS); 625 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); 626 __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); 627 xa_unlock_irq(&mapping->i_pages); 628 } else { 629 page->mapping = NULL; 630 xa_unlock_irq(&mapping->i_pages); 631 page_ref_sub(page, nr); 632 } 633 return error; 634 } 635 636 /* 637 * Like delete_from_page_cache, but substitutes swap for page. 
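 * That is, instead of clearing the slot, the page's entry in the mapping is
 * overwritten with swp_to_radix_entry(swap), so a later lookup at this
 * index finds where the data has gone.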
 */
static void shmem_delete_from_page_cache(struct page *page, void *radswap)
{
	struct address_space *mapping = page->mapping;
	int error;

	VM_BUG_ON_PAGE(PageCompound(page), page);

	xa_lock_irq(&mapping->i_pages);
	error = shmem_replace_entry(mapping, page->index, page, radswap);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_node_page_state(page, NR_FILE_PAGES);
	__dec_node_page_state(page, NR_SHMEM);
	xa_unlock_irq(&mapping->i_pages);
	put_page(page);
	BUG_ON(error);
}

/*
 * Remove swap entry from radix tree, free the swap and its page cache.
 */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	void *old;

	xa_lock_irq(&mapping->i_pages);
	old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
	xa_unlock_irq(&mapping->i_pages);
	if (old != radswap)
		return -ENOENT;
	free_swap_and_cache(radix_to_swp_entry(radswap));
	return 0;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	struct radix_tree_iter iter;
	void __rcu **slot;
	struct page *page;
	unsigned long swapped = 0;

	rcu_read_lock();

	radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
		if (iter.index >= end)
			break;

		page = radix_tree_deref_slot(slot);

		if (radix_tree_deref_retry(page)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}

		if (xa_is_value(page))
			swapped++;

		if (need_resched()) {
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();
		}
	}

	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma are swapped out.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	unsigned long swapped;

	/* Be careful as we don't hold info->lock */
	swapped = READ_ONCE(info->swapped);

	/*
	 * The easier cases are when the shmem object has nothing in swap, or
	 * the vma maps it whole. Then we can simply use the stats that we
	 * already track.
	 */
	if (!swapped)
		return 0;

	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
		return swapped << PAGE_SHIFT;

	/* Here comes the more involved part */
	return shmem_partial_swap_usage(mapping,
			linear_page_index(vma, vma->vm_start),
			linear_page_index(vma, vma->vm_end));
}

/*
 * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index = 0;

	pagevec_init(&pvec);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
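	 * Each pass below pulls up to PAGEVEC_SIZE entries, drops the swap
	 * (exceptional) entries, and hands the real pages to
	 * check_move_unevictable_pages() so they can return to the normal LRUs.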
762 */ 763 while (!mapping_unevictable(mapping)) { 764 /* 765 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it 766 * has finished, if it hits a row of PAGEVEC_SIZE swap entries. 767 */ 768 pvec.nr = find_get_entries(mapping, index, 769 PAGEVEC_SIZE, pvec.pages, indices); 770 if (!pvec.nr) 771 break; 772 index = indices[pvec.nr - 1] + 1; 773 pagevec_remove_exceptionals(&pvec); 774 check_move_unevictable_pages(pvec.pages, pvec.nr); 775 pagevec_release(&pvec); 776 cond_resched(); 777 } 778 } 779 780 /* 781 * Remove range of pages and swap entries from radix tree, and free them. 782 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 783 */ 784 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, 785 bool unfalloc) 786 { 787 struct address_space *mapping = inode->i_mapping; 788 struct shmem_inode_info *info = SHMEM_I(inode); 789 pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; 790 pgoff_t end = (lend + 1) >> PAGE_SHIFT; 791 unsigned int partial_start = lstart & (PAGE_SIZE - 1); 792 unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1); 793 struct pagevec pvec; 794 pgoff_t indices[PAGEVEC_SIZE]; 795 long nr_swaps_freed = 0; 796 pgoff_t index; 797 int i; 798 799 if (lend == -1) 800 end = -1; /* unsigned, so actually very big */ 801 802 pagevec_init(&pvec); 803 index = start; 804 while (index < end) { 805 pvec.nr = find_get_entries(mapping, index, 806 min(end - index, (pgoff_t)PAGEVEC_SIZE), 807 pvec.pages, indices); 808 if (!pvec.nr) 809 break; 810 for (i = 0; i < pagevec_count(&pvec); i++) { 811 struct page *page = pvec.pages[i]; 812 813 index = indices[i]; 814 if (index >= end) 815 break; 816 817 if (xa_is_value(page)) { 818 if (unfalloc) 819 continue; 820 nr_swaps_freed += !shmem_free_swap(mapping, 821 index, page); 822 continue; 823 } 824 825 VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page); 826 827 if (!trylock_page(page)) 828 continue; 829 830 if (PageTransTail(page)) { 831 /* Middle of THP: zero out the page */ 832 clear_highpage(page); 833 unlock_page(page); 834 continue; 835 } else if (PageTransHuge(page)) { 836 if (index == round_down(end, HPAGE_PMD_NR)) { 837 /* 838 * Range ends in the middle of THP: 839 * zero out the page 840 */ 841 clear_highpage(page); 842 unlock_page(page); 843 continue; 844 } 845 index += HPAGE_PMD_NR - 1; 846 i += HPAGE_PMD_NR - 1; 847 } 848 849 if (!unfalloc || !PageUptodate(page)) { 850 VM_BUG_ON_PAGE(PageTail(page), page); 851 if (page_mapping(page) == mapping) { 852 VM_BUG_ON_PAGE(PageWriteback(page), page); 853 truncate_inode_page(mapping, page); 854 } 855 } 856 unlock_page(page); 857 } 858 pagevec_remove_exceptionals(&pvec); 859 pagevec_release(&pvec); 860 cond_resched(); 861 index++; 862 } 863 864 if (partial_start) { 865 struct page *page = NULL; 866 shmem_getpage(inode, start - 1, &page, SGP_READ); 867 if (page) { 868 unsigned int top = PAGE_SIZE; 869 if (start > end) { 870 top = partial_end; 871 partial_end = 0; 872 } 873 zero_user_segment(page, partial_start, top); 874 set_page_dirty(page); 875 unlock_page(page); 876 put_page(page); 877 } 878 } 879 if (partial_end) { 880 struct page *page = NULL; 881 shmem_getpage(inode, end, &page, SGP_READ); 882 if (page) { 883 zero_user_segment(page, 0, partial_end); 884 set_page_dirty(page); 885 unlock_page(page); 886 put_page(page); 887 } 888 } 889 if (start >= end) 890 return; 891 892 index = start; 893 while (index < end) { 894 cond_resched(); 895 896 pvec.nr = find_get_entries(mapping, index, 897 min(end - index, (pgoff_t)PAGEVEC_SIZE), 
898 pvec.pages, indices); 899 if (!pvec.nr) { 900 /* If all gone or hole-punch or unfalloc, we're done */ 901 if (index == start || end != -1) 902 break; 903 /* But if truncating, restart to make sure all gone */ 904 index = start; 905 continue; 906 } 907 for (i = 0; i < pagevec_count(&pvec); i++) { 908 struct page *page = pvec.pages[i]; 909 910 index = indices[i]; 911 if (index >= end) 912 break; 913 914 if (xa_is_value(page)) { 915 if (unfalloc) 916 continue; 917 if (shmem_free_swap(mapping, index, page)) { 918 /* Swap was replaced by page: retry */ 919 index--; 920 break; 921 } 922 nr_swaps_freed++; 923 continue; 924 } 925 926 lock_page(page); 927 928 if (PageTransTail(page)) { 929 /* Middle of THP: zero out the page */ 930 clear_highpage(page); 931 unlock_page(page); 932 /* 933 * Partial thp truncate due 'start' in middle 934 * of THP: don't need to look on these pages 935 * again on !pvec.nr restart. 936 */ 937 if (index != round_down(end, HPAGE_PMD_NR)) 938 start++; 939 continue; 940 } else if (PageTransHuge(page)) { 941 if (index == round_down(end, HPAGE_PMD_NR)) { 942 /* 943 * Range ends in the middle of THP: 944 * zero out the page 945 */ 946 clear_highpage(page); 947 unlock_page(page); 948 continue; 949 } 950 index += HPAGE_PMD_NR - 1; 951 i += HPAGE_PMD_NR - 1; 952 } 953 954 if (!unfalloc || !PageUptodate(page)) { 955 VM_BUG_ON_PAGE(PageTail(page), page); 956 if (page_mapping(page) == mapping) { 957 VM_BUG_ON_PAGE(PageWriteback(page), page); 958 truncate_inode_page(mapping, page); 959 } else { 960 /* Page was replaced by swap: retry */ 961 unlock_page(page); 962 index--; 963 break; 964 } 965 } 966 unlock_page(page); 967 } 968 pagevec_remove_exceptionals(&pvec); 969 pagevec_release(&pvec); 970 index++; 971 } 972 973 spin_lock_irq(&info->lock); 974 info->swapped -= nr_swaps_freed; 975 shmem_recalc_inode(inode); 976 spin_unlock_irq(&info->lock); 977 } 978 979 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 980 { 981 shmem_undo_range(inode, lstart, lend, false); 982 inode->i_ctime = inode->i_mtime = current_time(inode); 983 } 984 EXPORT_SYMBOL_GPL(shmem_truncate_range); 985 986 static int shmem_getattr(const struct path *path, struct kstat *stat, 987 u32 request_mask, unsigned int query_flags) 988 { 989 struct inode *inode = path->dentry->d_inode; 990 struct shmem_inode_info *info = SHMEM_I(inode); 991 struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); 992 993 if (info->alloced - info->swapped != inode->i_mapping->nrpages) { 994 spin_lock_irq(&info->lock); 995 shmem_recalc_inode(inode); 996 spin_unlock_irq(&info->lock); 997 } 998 generic_fillattr(inode, stat); 999 1000 if (is_huge_enabled(sb_info)) 1001 stat->blksize = HPAGE_PMD_SIZE; 1002 1003 return 0; 1004 } 1005 1006 static int shmem_setattr(struct dentry *dentry, struct iattr *attr) 1007 { 1008 struct inode *inode = d_inode(dentry); 1009 struct shmem_inode_info *info = SHMEM_I(inode); 1010 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1011 int error; 1012 1013 error = setattr_prepare(dentry, attr); 1014 if (error) 1015 return error; 1016 1017 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 1018 loff_t oldsize = inode->i_size; 1019 loff_t newsize = attr->ia_size; 1020 1021 /* protected by i_mutex */ 1022 if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || 1023 (newsize > oldsize && (info->seals & F_SEAL_GROW))) 1024 return -EPERM; 1025 1026 if (newsize != oldsize) { 1027 error = shmem_reacct_size(SHMEM_I(inode)->flags, 1028 oldsize, newsize); 1029 if (error) 1030 
return error; 1031 i_size_write(inode, newsize); 1032 inode->i_ctime = inode->i_mtime = current_time(inode); 1033 } 1034 if (newsize <= oldsize) { 1035 loff_t holebegin = round_up(newsize, PAGE_SIZE); 1036 if (oldsize > holebegin) 1037 unmap_mapping_range(inode->i_mapping, 1038 holebegin, 0, 1); 1039 if (info->alloced) 1040 shmem_truncate_range(inode, 1041 newsize, (loff_t)-1); 1042 /* unmap again to remove racily COWed private pages */ 1043 if (oldsize > holebegin) 1044 unmap_mapping_range(inode->i_mapping, 1045 holebegin, 0, 1); 1046 1047 /* 1048 * Part of the huge page can be beyond i_size: subject 1049 * to shrink under memory pressure. 1050 */ 1051 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { 1052 spin_lock(&sbinfo->shrinklist_lock); 1053 /* 1054 * _careful to defend against unlocked access to 1055 * ->shrink_list in shmem_unused_huge_shrink() 1056 */ 1057 if (list_empty_careful(&info->shrinklist)) { 1058 list_add_tail(&info->shrinklist, 1059 &sbinfo->shrinklist); 1060 sbinfo->shrinklist_len++; 1061 } 1062 spin_unlock(&sbinfo->shrinklist_lock); 1063 } 1064 } 1065 } 1066 1067 setattr_copy(inode, attr); 1068 if (attr->ia_valid & ATTR_MODE) 1069 error = posix_acl_chmod(inode, inode->i_mode); 1070 return error; 1071 } 1072 1073 static void shmem_evict_inode(struct inode *inode) 1074 { 1075 struct shmem_inode_info *info = SHMEM_I(inode); 1076 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1077 1078 if (inode->i_mapping->a_ops == &shmem_aops) { 1079 shmem_unacct_size(info->flags, inode->i_size); 1080 inode->i_size = 0; 1081 shmem_truncate_range(inode, 0, (loff_t)-1); 1082 if (!list_empty(&info->shrinklist)) { 1083 spin_lock(&sbinfo->shrinklist_lock); 1084 if (!list_empty(&info->shrinklist)) { 1085 list_del_init(&info->shrinklist); 1086 sbinfo->shrinklist_len--; 1087 } 1088 spin_unlock(&sbinfo->shrinklist_lock); 1089 } 1090 if (!list_empty(&info->swaplist)) { 1091 mutex_lock(&shmem_swaplist_mutex); 1092 list_del_init(&info->swaplist); 1093 mutex_unlock(&shmem_swaplist_mutex); 1094 } 1095 } 1096 1097 simple_xattrs_free(&info->xattrs); 1098 WARN_ON(inode->i_blocks); 1099 shmem_free_inode(inode->i_sb); 1100 clear_inode(inode); 1101 } 1102 1103 static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) 1104 { 1105 struct radix_tree_iter iter; 1106 void __rcu **slot; 1107 unsigned long found = -1; 1108 unsigned int checked = 0; 1109 1110 rcu_read_lock(); 1111 radix_tree_for_each_slot(slot, root, &iter, 0) { 1112 void *entry = radix_tree_deref_slot(slot); 1113 1114 if (radix_tree_deref_retry(entry)) { 1115 slot = radix_tree_iter_retry(&iter); 1116 continue; 1117 } 1118 if (entry == item) { 1119 found = iter.index; 1120 break; 1121 } 1122 checked++; 1123 if ((checked % 4096) != 0) 1124 continue; 1125 slot = radix_tree_iter_resume(slot, &iter); 1126 cond_resched_rcu(); 1127 } 1128 1129 rcu_read_unlock(); 1130 return found; 1131 } 1132 1133 /* 1134 * If swap found in inode, free it and move page from swapcache to filecache. 1135 */ 1136 static int shmem_unuse_inode(struct shmem_inode_info *info, 1137 swp_entry_t swap, struct page **pagep) 1138 { 1139 struct address_space *mapping = info->vfs_inode.i_mapping; 1140 void *radswap; 1141 pgoff_t index; 1142 gfp_t gfp; 1143 int error = 0; 1144 1145 radswap = swp_to_radix_entry(swap); 1146 index = find_swap_entry(&mapping->i_pages, radswap); 1147 if (index == -1) 1148 return -EAGAIN; /* tell shmem_unuse we found nothing */ 1149 1150 /* 1151 * Move _head_ to start search for next from here. 
1152 * But be careful: shmem_evict_inode checks list_empty without taking 1153 * mutex, and there's an instant in list_move_tail when info->swaplist 1154 * would appear empty, if it were the only one on shmem_swaplist. 1155 */ 1156 if (shmem_swaplist.next != &info->swaplist) 1157 list_move_tail(&shmem_swaplist, &info->swaplist); 1158 1159 gfp = mapping_gfp_mask(mapping); 1160 if (shmem_should_replace_page(*pagep, gfp)) { 1161 mutex_unlock(&shmem_swaplist_mutex); 1162 error = shmem_replace_page(pagep, gfp, info, index); 1163 mutex_lock(&shmem_swaplist_mutex); 1164 /* 1165 * We needed to drop mutex to make that restrictive page 1166 * allocation, but the inode might have been freed while we 1167 * dropped it: although a racing shmem_evict_inode() cannot 1168 * complete without emptying the radix_tree, our page lock 1169 * on this swapcache page is not enough to prevent that - 1170 * free_swap_and_cache() of our swap entry will only 1171 * trylock_page(), removing swap from radix_tree whatever. 1172 * 1173 * We must not proceed to shmem_add_to_page_cache() if the 1174 * inode has been freed, but of course we cannot rely on 1175 * inode or mapping or info to check that. However, we can 1176 * safely check if our swap entry is still in use (and here 1177 * it can't have got reused for another page): if it's still 1178 * in use, then the inode cannot have been freed yet, and we 1179 * can safely proceed (if it's no longer in use, that tells 1180 * nothing about the inode, but we don't need to unuse swap). 1181 */ 1182 if (!page_swapcount(*pagep)) 1183 error = -ENOENT; 1184 } 1185 1186 /* 1187 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 1188 * but also to hold up shmem_evict_inode(): so inode cannot be freed 1189 * beneath us (pagelock doesn't help until the page is in pagecache). 1190 */ 1191 if (!error) 1192 error = shmem_add_to_page_cache(*pagep, mapping, index, 1193 radswap); 1194 if (error != -ENOMEM) { 1195 /* 1196 * Truncation and eviction use free_swap_and_cache(), which 1197 * only does trylock page: if we raced, best clean up here. 1198 */ 1199 delete_from_swap_cache(*pagep); 1200 set_page_dirty(*pagep); 1201 if (!error) { 1202 spin_lock_irq(&info->lock); 1203 info->swapped--; 1204 spin_unlock_irq(&info->lock); 1205 swap_free(swap); 1206 } 1207 } 1208 return error; 1209 } 1210 1211 /* 1212 * Search through swapped inodes to find and replace swap by page. 1213 */ 1214 int shmem_unuse(swp_entry_t swap, struct page *page) 1215 { 1216 struct list_head *this, *next; 1217 struct shmem_inode_info *info; 1218 struct mem_cgroup *memcg; 1219 int error = 0; 1220 1221 /* 1222 * There's a faint possibility that swap page was replaced before 1223 * caller locked it: caller will come back later with the right page. 1224 */ 1225 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) 1226 goto out; 1227 1228 /* 1229 * Charge page using GFP_KERNEL while we can wait, before taking 1230 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 1231 * Charged back to the user (not to caller) when swap account is used. 
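	 * (shmem_unuse() runs from the swapoff path, where sleeping is fine,
	 * which is what makes a GFP_KERNEL charge acceptable here.)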
1232 */ 1233 error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, 1234 &memcg, false); 1235 if (error) 1236 goto out; 1237 /* No radix_tree_preload: swap entry keeps a place for page in tree */ 1238 error = -EAGAIN; 1239 1240 mutex_lock(&shmem_swaplist_mutex); 1241 list_for_each_safe(this, next, &shmem_swaplist) { 1242 info = list_entry(this, struct shmem_inode_info, swaplist); 1243 if (info->swapped) 1244 error = shmem_unuse_inode(info, swap, &page); 1245 else 1246 list_del_init(&info->swaplist); 1247 cond_resched(); 1248 if (error != -EAGAIN) 1249 break; 1250 /* found nothing in this: move on to search the next */ 1251 } 1252 mutex_unlock(&shmem_swaplist_mutex); 1253 1254 if (error) { 1255 if (error != -ENOMEM) 1256 error = 0; 1257 mem_cgroup_cancel_charge(page, memcg, false); 1258 } else 1259 mem_cgroup_commit_charge(page, memcg, true, false); 1260 out: 1261 unlock_page(page); 1262 put_page(page); 1263 return error; 1264 } 1265 1266 /* 1267 * Move the page from the page cache to the swap cache. 1268 */ 1269 static int shmem_writepage(struct page *page, struct writeback_control *wbc) 1270 { 1271 struct shmem_inode_info *info; 1272 struct address_space *mapping; 1273 struct inode *inode; 1274 swp_entry_t swap; 1275 pgoff_t index; 1276 1277 VM_BUG_ON_PAGE(PageCompound(page), page); 1278 BUG_ON(!PageLocked(page)); 1279 mapping = page->mapping; 1280 index = page->index; 1281 inode = mapping->host; 1282 info = SHMEM_I(inode); 1283 if (info->flags & VM_LOCKED) 1284 goto redirty; 1285 if (!total_swap_pages) 1286 goto redirty; 1287 1288 /* 1289 * Our capabilities prevent regular writeback or sync from ever calling 1290 * shmem_writepage; but a stacking filesystem might use ->writepage of 1291 * its underlying filesystem, in which case tmpfs should write out to 1292 * swap only in response to memory pressure, and not for the writeback 1293 * threads or sync. 1294 */ 1295 if (!wbc->for_reclaim) { 1296 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 1297 goto redirty; 1298 } 1299 1300 /* 1301 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC 1302 * value into swapfile.c, the only way we can correctly account for a 1303 * fallocated page arriving here is now to initialize it and write it. 1304 * 1305 * That's okay for a page already fallocated earlier, but if we have 1306 * not yet completed the fallocation, then (a) we want to keep track 1307 * of this page in case we have to undo it, and (b) it may not be a 1308 * good idea to continue anyway, once we're pushing into swap. So 1309 * reactivate the page, and let shmem_fallocate() quit when too many. 1310 */ 1311 if (!PageUptodate(page)) { 1312 if (inode->i_private) { 1313 struct shmem_falloc *shmem_falloc; 1314 spin_lock(&inode->i_lock); 1315 shmem_falloc = inode->i_private; 1316 if (shmem_falloc && 1317 !shmem_falloc->waitq && 1318 index >= shmem_falloc->start && 1319 index < shmem_falloc->next) 1320 shmem_falloc->nr_unswapped++; 1321 else 1322 shmem_falloc = NULL; 1323 spin_unlock(&inode->i_lock); 1324 if (shmem_falloc) 1325 goto redirty; 1326 } 1327 clear_highpage(page); 1328 flush_dcache_page(page); 1329 SetPageUptodate(page); 1330 } 1331 1332 swap = get_swap_page(page); 1333 if (!swap.val) 1334 goto redirty; 1335 1336 /* 1337 * Add inode to shmem_unuse()'s list of swapped-out inodes, 1338 * if it's not already there. Do it now before the page is 1339 * moved to swap cache, when its pagelock no longer protects 1340 * the inode from eviction. 
But don't unlock the mutex until 1341 * we've incremented swapped, because shmem_unuse_inode() will 1342 * prune a !swapped inode from the swaplist under this mutex. 1343 */ 1344 mutex_lock(&shmem_swaplist_mutex); 1345 if (list_empty(&info->swaplist)) 1346 list_add_tail(&info->swaplist, &shmem_swaplist); 1347 1348 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1349 spin_lock_irq(&info->lock); 1350 shmem_recalc_inode(inode); 1351 info->swapped++; 1352 spin_unlock_irq(&info->lock); 1353 1354 swap_shmem_alloc(swap); 1355 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); 1356 1357 mutex_unlock(&shmem_swaplist_mutex); 1358 BUG_ON(page_mapped(page)); 1359 swap_writepage(page, wbc); 1360 return 0; 1361 } 1362 1363 mutex_unlock(&shmem_swaplist_mutex); 1364 put_swap_page(page, swap); 1365 redirty: 1366 set_page_dirty(page); 1367 if (wbc->for_reclaim) 1368 return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ 1369 unlock_page(page); 1370 return 0; 1371 } 1372 1373 #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) 1374 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 1375 { 1376 char buffer[64]; 1377 1378 if (!mpol || mpol->mode == MPOL_DEFAULT) 1379 return; /* show nothing */ 1380 1381 mpol_to_str(buffer, sizeof(buffer), mpol); 1382 1383 seq_printf(seq, ",mpol=%s", buffer); 1384 } 1385 1386 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 1387 { 1388 struct mempolicy *mpol = NULL; 1389 if (sbinfo->mpol) { 1390 spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ 1391 mpol = sbinfo->mpol; 1392 mpol_get(mpol); 1393 spin_unlock(&sbinfo->stat_lock); 1394 } 1395 return mpol; 1396 } 1397 #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ 1398 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 1399 { 1400 } 1401 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 1402 { 1403 return NULL; 1404 } 1405 #endif /* CONFIG_NUMA && CONFIG_TMPFS */ 1406 #ifndef CONFIG_NUMA 1407 #define vm_policy vm_private_data 1408 #endif 1409 1410 static void shmem_pseudo_vma_init(struct vm_area_struct *vma, 1411 struct shmem_inode_info *info, pgoff_t index) 1412 { 1413 /* Create a pseudo vma that just contains the policy */ 1414 vma_init(vma, NULL); 1415 /* Bias interleave by inode number to distribute better across nodes */ 1416 vma->vm_pgoff = index + info->vfs_inode.i_ino; 1417 vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); 1418 } 1419 1420 static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) 1421 { 1422 /* Drop reference taken by mpol_shared_policy_lookup() */ 1423 mpol_cond_put(vma->vm_policy); 1424 } 1425 1426 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 1427 struct shmem_inode_info *info, pgoff_t index) 1428 { 1429 struct vm_area_struct pvma; 1430 struct page *page; 1431 struct vm_fault vmf; 1432 1433 shmem_pseudo_vma_init(&pvma, info, index); 1434 vmf.vma = &pvma; 1435 vmf.address = 0; 1436 page = swap_cluster_readahead(swap, gfp, &vmf); 1437 shmem_pseudo_vma_destroy(&pvma); 1438 1439 return page; 1440 } 1441 1442 static struct page *shmem_alloc_hugepage(gfp_t gfp, 1443 struct shmem_inode_info *info, pgoff_t index) 1444 { 1445 struct vm_area_struct pvma; 1446 struct inode *inode = &info->vfs_inode; 1447 struct address_space *mapping = inode->i_mapping; 1448 pgoff_t idx, hindex; 1449 void __rcu **results; 1450 struct page *page; 1451 1452 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) 1453 return NULL; 1454 1455 hindex = round_down(index, 
HPAGE_PMD_NR); 1456 rcu_read_lock(); 1457 if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, 1458 hindex, 1) && idx < hindex + HPAGE_PMD_NR) { 1459 rcu_read_unlock(); 1460 return NULL; 1461 } 1462 rcu_read_unlock(); 1463 1464 shmem_pseudo_vma_init(&pvma, info, hindex); 1465 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, 1466 HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); 1467 shmem_pseudo_vma_destroy(&pvma); 1468 if (page) 1469 prep_transhuge_page(page); 1470 return page; 1471 } 1472 1473 static struct page *shmem_alloc_page(gfp_t gfp, 1474 struct shmem_inode_info *info, pgoff_t index) 1475 { 1476 struct vm_area_struct pvma; 1477 struct page *page; 1478 1479 shmem_pseudo_vma_init(&pvma, info, index); 1480 page = alloc_page_vma(gfp, &pvma, 0); 1481 shmem_pseudo_vma_destroy(&pvma); 1482 1483 return page; 1484 } 1485 1486 static struct page *shmem_alloc_and_acct_page(gfp_t gfp, 1487 struct inode *inode, 1488 pgoff_t index, bool huge) 1489 { 1490 struct shmem_inode_info *info = SHMEM_I(inode); 1491 struct page *page; 1492 int nr; 1493 int err = -ENOSPC; 1494 1495 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) 1496 huge = false; 1497 nr = huge ? HPAGE_PMD_NR : 1; 1498 1499 if (!shmem_inode_acct_block(inode, nr)) 1500 goto failed; 1501 1502 if (huge) 1503 page = shmem_alloc_hugepage(gfp, info, index); 1504 else 1505 page = shmem_alloc_page(gfp, info, index); 1506 if (page) { 1507 __SetPageLocked(page); 1508 __SetPageSwapBacked(page); 1509 return page; 1510 } 1511 1512 err = -ENOMEM; 1513 shmem_inode_unacct_blocks(inode, nr); 1514 failed: 1515 return ERR_PTR(err); 1516 } 1517 1518 /* 1519 * When a page is moved from swapcache to shmem filecache (either by the 1520 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of 1521 * shmem_unuse_inode()), it may have been read in earlier from swap, in 1522 * ignorance of the mapping it belongs to. If that mapping has special 1523 * constraints (like the gma500 GEM driver, which requires RAM below 4GB), 1524 * we may need to copy to a suitable page before moving to filecache. 1525 * 1526 * In a future release, this may well be extended to respect cpuset and 1527 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); 1528 * but for now it is a simple matter of zone. 1529 */ 1530 static bool shmem_should_replace_page(struct page *page, gfp_t gfp) 1531 { 1532 return page_zonenum(page) > gfp_zone(gfp); 1533 } 1534 1535 static int shmem_replace_page(struct page **pagep, gfp_t gfp, 1536 struct shmem_inode_info *info, pgoff_t index) 1537 { 1538 struct page *oldpage, *newpage; 1539 struct address_space *swap_mapping; 1540 pgoff_t swap_index; 1541 int error; 1542 1543 oldpage = *pagep; 1544 swap_index = page_private(oldpage); 1545 swap_mapping = page_mapping(oldpage); 1546 1547 /* 1548 * We have arrived here because our zones are constrained, so don't 1549 * limit chance of success by further cpuset and node constraints. 1550 */ 1551 gfp &= ~GFP_CONSTRAINT_MASK; 1552 newpage = shmem_alloc_page(gfp, info, index); 1553 if (!newpage) 1554 return -ENOMEM; 1555 1556 get_page(newpage); 1557 copy_highpage(newpage, oldpage); 1558 flush_dcache_page(newpage); 1559 1560 __SetPageLocked(newpage); 1561 __SetPageSwapBacked(newpage); 1562 SetPageUptodate(newpage); 1563 set_page_private(newpage, swap_index); 1564 SetPageSwapCache(newpage); 1565 1566 /* 1567 * Our caller will very soon move newpage out of swapcache, but it's 1568 * a nice clean interface for us to replace oldpage by newpage there. 
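	 * (Note that oldpage is put twice at the end of this function: once
	 * for the reference the swap cache held on it, now transferred to
	 * newpage, and once for our caller's lookup reference, since on
	 * success the caller carries on with *pagep == newpage.)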
1569 */ 1570 xa_lock_irq(&swap_mapping->i_pages); 1571 error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); 1572 if (!error) { 1573 __inc_node_page_state(newpage, NR_FILE_PAGES); 1574 __dec_node_page_state(oldpage, NR_FILE_PAGES); 1575 } 1576 xa_unlock_irq(&swap_mapping->i_pages); 1577 1578 if (unlikely(error)) { 1579 /* 1580 * Is this possible? I think not, now that our callers check 1581 * both PageSwapCache and page_private after getting page lock; 1582 * but be defensive. Reverse old to newpage for clear and free. 1583 */ 1584 oldpage = newpage; 1585 } else { 1586 mem_cgroup_migrate(oldpage, newpage); 1587 lru_cache_add_anon(newpage); 1588 *pagep = newpage; 1589 } 1590 1591 ClearPageSwapCache(oldpage); 1592 set_page_private(oldpage, 0); 1593 1594 unlock_page(oldpage); 1595 put_page(oldpage); 1596 put_page(oldpage); 1597 return error; 1598 } 1599 1600 /* 1601 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1602 * 1603 * If we allocate a new one we do not mark it dirty. That's up to the 1604 * vm. If we swap it in we mark it dirty since we also free the swap 1605 * entry since a page cannot live in both the swap and page cache. 1606 * 1607 * fault_mm and fault_type are only supplied by shmem_fault: 1608 * otherwise they are NULL. 1609 */ 1610 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 1611 struct page **pagep, enum sgp_type sgp, gfp_t gfp, 1612 struct vm_area_struct *vma, struct vm_fault *vmf, 1613 vm_fault_t *fault_type) 1614 { 1615 struct address_space *mapping = inode->i_mapping; 1616 struct shmem_inode_info *info = SHMEM_I(inode); 1617 struct shmem_sb_info *sbinfo; 1618 struct mm_struct *charge_mm; 1619 struct mem_cgroup *memcg; 1620 struct page *page; 1621 swp_entry_t swap; 1622 enum sgp_type sgp_huge = sgp; 1623 pgoff_t hindex = index; 1624 int error; 1625 int once = 0; 1626 int alloced = 0; 1627 1628 if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) 1629 return -EFBIG; 1630 if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) 1631 sgp = SGP_CACHE; 1632 repeat: 1633 swap.val = 0; 1634 page = find_lock_entry(mapping, index); 1635 if (xa_is_value(page)) { 1636 swap = radix_to_swp_entry(page); 1637 page = NULL; 1638 } 1639 1640 if (sgp <= SGP_CACHE && 1641 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 1642 error = -EINVAL; 1643 goto unlock; 1644 } 1645 1646 if (page && sgp == SGP_WRITE) 1647 mark_page_accessed(page); 1648 1649 /* fallocated page? */ 1650 if (page && !PageUptodate(page)) { 1651 if (sgp != SGP_READ) 1652 goto clear; 1653 unlock_page(page); 1654 put_page(page); 1655 page = NULL; 1656 } 1657 if (page || (sgp == SGP_READ && !swap.val)) { 1658 *pagep = page; 1659 return 0; 1660 } 1661 1662 /* 1663 * Fast cache lookup did not find it: 1664 * bring it back from swap or allocate. 1665 */ 1666 sbinfo = SHMEM_SB(inode->i_sb); 1667 charge_mm = vma ? vma->vm_mm : current->mm; 1668 1669 if (swap.val) { 1670 /* Look it up and read it in.. */ 1671 page = lookup_swap_cache(swap, NULL, 0); 1672 if (!page) { 1673 /* Or update major stats only when swapin succeeds?? 
 */
			if (fault_type) {
				*fault_type |= VM_FAULT_MAJOR;
				count_vm_event(PGMAJFAULT);
				count_memcg_event_mm(charge_mm, PGMAJFAULT);
			}
			/* Here we actually start the I/O */
			page = shmem_swapin(swap, gfp, info, index);
			if (!page) {
				error = -ENOMEM;
				goto failed;
			}
		}

		/* We have to do this with page locked to prevent races */
		lock_page(page);
		if (!PageSwapCache(page) || page_private(page) != swap.val ||
		    !shmem_confirm_swap(mapping, index, swap)) {
			error = -EEXIST;	/* try again */
			goto unlock;
		}
		if (!PageUptodate(page)) {
			error = -EIO;
			goto failed;
		}
		wait_on_page_writeback(page);

		if (shmem_should_replace_page(page, gfp)) {
			error = shmem_replace_page(&page, gfp, info, index);
			if (error)
				goto failed;
		}

		error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
						    false);
		if (!error) {
			error = shmem_add_to_page_cache(page, mapping, index,
						swp_to_radix_entry(swap));
			/*
			 * We already confirmed swap under page lock, and make
			 * no memory allocation here, so usually no possibility
			 * of error; but free_swap_and_cache() only trylocks a
			 * page, so it is just possible that the entry has been
			 * truncated or holepunched since swap was confirmed.
			 * shmem_undo_range() will have done some of the
			 * unaccounting, now delete_from_swap_cache() will do
			 * the rest.
			 * Reset swap.val? No, leave it so "failed" goes back to
			 * "repeat": reading a hole and writing should succeed.
			 */
			if (error) {
				mem_cgroup_cancel_charge(page, memcg, false);
				delete_from_swap_cache(page);
			}
		}
		if (error)
			goto failed;

		mem_cgroup_commit_charge(page, memcg, true, false);

		spin_lock_irq(&info->lock);
		info->swapped--;
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);

		if (sgp == SGP_WRITE)
			mark_page_accessed(page);

		delete_from_swap_cache(page);
		set_page_dirty(page);
		swap_free(swap);

	} else {
		if (vma && userfaultfd_missing(vma)) {
			*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
			return 0;
		}

		/* shmem_symlink() */
		if (mapping->a_ops != &shmem_aops)
			goto alloc_nohuge;
		if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
			goto alloc_nohuge;
		if (shmem_huge == SHMEM_HUGE_FORCE)
			goto alloc_huge;
		switch (sbinfo->huge) {
			loff_t i_size;
			pgoff_t off;
		case SHMEM_HUGE_NEVER:
			goto alloc_nohuge;
		case SHMEM_HUGE_WITHIN_SIZE:
			off = round_up(index, HPAGE_PMD_NR);
			i_size = round_up(i_size_read(inode), PAGE_SIZE);
			if (i_size >= HPAGE_PMD_SIZE &&
			    i_size >> PAGE_SHIFT >= off)
				goto alloc_huge;
			/* fallthrough */
		case SHMEM_HUGE_ADVISE:
			if (sgp_huge == SGP_HUGE)
				goto alloc_huge;
			/* TODO: implement fadvise() hints */
			goto alloc_nohuge;
		}

alloc_huge:
		page = shmem_alloc_and_acct_page(gfp, inode, index, true);
		if (IS_ERR(page)) {
alloc_nohuge:		page = shmem_alloc_and_acct_page(gfp, inode,
							 index, false);
		}
		if (IS_ERR(page)) {
			int retry = 5;
			error = PTR_ERR(page);
			page = NULL;
			if (error != -ENOSPC)
				goto failed;
			/*
			 * Try to reclaim some space by splitting a huge page
			 * beyond i_size on the filesystem.
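			 * Each pass asks the shrinker to split one such page;
			 * if it managed to, retry the order-0 allocation,
			 * otherwise keep trying for a few rounds and then
			 * give up.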
1792 */ 1793 while (retry--) { 1794 int ret; 1795 ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); 1796 if (ret == SHRINK_STOP) 1797 break; 1798 if (ret) 1799 goto alloc_nohuge; 1800 } 1801 goto failed; 1802 } 1803 1804 if (PageTransHuge(page)) 1805 hindex = round_down(index, HPAGE_PMD_NR); 1806 else 1807 hindex = index; 1808 1809 if (sgp == SGP_WRITE) 1810 __SetPageReferenced(page); 1811 1812 error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, 1813 PageTransHuge(page)); 1814 if (error) 1815 goto unacct; 1816 error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, 1817 compound_order(page)); 1818 if (!error) { 1819 error = shmem_add_to_page_cache(page, mapping, hindex, 1820 NULL); 1821 radix_tree_preload_end(); 1822 } 1823 if (error) { 1824 mem_cgroup_cancel_charge(page, memcg, 1825 PageTransHuge(page)); 1826 goto unacct; 1827 } 1828 mem_cgroup_commit_charge(page, memcg, false, 1829 PageTransHuge(page)); 1830 lru_cache_add_anon(page); 1831 1832 spin_lock_irq(&info->lock); 1833 info->alloced += 1 << compound_order(page); 1834 inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); 1835 shmem_recalc_inode(inode); 1836 spin_unlock_irq(&info->lock); 1837 alloced = true; 1838 1839 if (PageTransHuge(page) && 1840 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < 1841 hindex + HPAGE_PMD_NR - 1) { 1842 /* 1843 * Part of the huge page is beyond i_size: subject 1844 * to shrink under memory pressure. 1845 */ 1846 spin_lock(&sbinfo->shrinklist_lock); 1847 /* 1848 * _careful to defend against unlocked access to 1849 * ->shrink_list in shmem_unused_huge_shrink() 1850 */ 1851 if (list_empty_careful(&info->shrinklist)) { 1852 list_add_tail(&info->shrinklist, 1853 &sbinfo->shrinklist); 1854 sbinfo->shrinklist_len++; 1855 } 1856 spin_unlock(&sbinfo->shrinklist_lock); 1857 } 1858 1859 /* 1860 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 1861 */ 1862 if (sgp == SGP_FALLOC) 1863 sgp = SGP_WRITE; 1864 clear: 1865 /* 1866 * Let SGP_WRITE caller clear ends if write does not fill page; 1867 * but SGP_FALLOC on a page fallocated earlier must initialize 1868 * it now, lest undo on failure cancel our earlier guarantee. 1869 */ 1870 if (sgp != SGP_WRITE && !PageUptodate(page)) { 1871 struct page *head = compound_head(page); 1872 int i; 1873 1874 for (i = 0; i < (1 << compound_order(head)); i++) { 1875 clear_highpage(head + i); 1876 flush_dcache_page(head + i); 1877 } 1878 SetPageUptodate(head); 1879 } 1880 } 1881 1882 /* Perhaps the file has been truncated since we checked */ 1883 if (sgp <= SGP_CACHE && 1884 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 1885 if (alloced) { 1886 ClearPageDirty(page); 1887 delete_from_page_cache(page); 1888 spin_lock_irq(&info->lock); 1889 shmem_recalc_inode(inode); 1890 spin_unlock_irq(&info->lock); 1891 } 1892 error = -EINVAL; 1893 goto unlock; 1894 } 1895 *pagep = page + index - hindex; 1896 return 0; 1897 1898 /* 1899 * Error recovery. 
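	 * unacct: a freshly allocated page could not be charged or inserted
	 * into the page cache, so undo its block accounting (and for a huge
	 * page, retry with a small one); failed/unlock: drop the page, and
	 * turn a swap entry that vanished beneath us into -EEXIST so that we
	 * restart from "repeat".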
1900 */ 1901 unacct: 1902 shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); 1903 1904 if (PageTransHuge(page)) { 1905 unlock_page(page); 1906 put_page(page); 1907 goto alloc_nohuge; 1908 } 1909 failed: 1910 if (swap.val && !shmem_confirm_swap(mapping, index, swap)) 1911 error = -EEXIST; 1912 unlock: 1913 if (page) { 1914 unlock_page(page); 1915 put_page(page); 1916 } 1917 if (error == -ENOSPC && !once++) { 1918 spin_lock_irq(&info->lock); 1919 shmem_recalc_inode(inode); 1920 spin_unlock_irq(&info->lock); 1921 goto repeat; 1922 } 1923 if (error == -EEXIST) /* from above or from radix_tree_insert */ 1924 goto repeat; 1925 return error; 1926 } 1927 1928 /* 1929 * This is like autoremove_wake_function, but it removes the wait queue 1930 * entry unconditionally - even if something else had already woken the 1931 * target. 1932 */ 1933 static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 1934 { 1935 int ret = default_wake_function(wait, mode, sync, key); 1936 list_del_init(&wait->entry); 1937 return ret; 1938 } 1939 1940 static vm_fault_t shmem_fault(struct vm_fault *vmf) 1941 { 1942 struct vm_area_struct *vma = vmf->vma; 1943 struct inode *inode = file_inode(vma->vm_file); 1944 gfp_t gfp = mapping_gfp_mask(inode->i_mapping); 1945 enum sgp_type sgp; 1946 int err; 1947 vm_fault_t ret = VM_FAULT_LOCKED; 1948 1949 /* 1950 * Trinity finds that probing a hole which tmpfs is punching can 1951 * prevent the hole-punch from ever completing: which in turn 1952 * locks writers out with its hold on i_mutex. So refrain from 1953 * faulting pages into the hole while it's being punched. Although 1954 * shmem_undo_range() does remove the additions, it may be unable to 1955 * keep up, as each new page needs its own unmap_mapping_range() call, 1956 * and the i_mmap tree grows ever slower to scan if new vmas are added. 1957 * 1958 * It does not matter if we sometimes reach this check just before the 1959 * hole-punch begins, so that one fault then races with the punch: 1960 * we just need to make racing faults a rare case. 1961 * 1962 * The implementation below would be much simpler if we just used a 1963 * standard mutex or completion: but we cannot take i_mutex in fault, 1964 * and bloating every shmem inode for this unlikely case would be sad. 1965 */ 1966 if (unlikely(inode->i_private)) { 1967 struct shmem_falloc *shmem_falloc; 1968 1969 spin_lock(&inode->i_lock); 1970 shmem_falloc = inode->i_private; 1971 if (shmem_falloc && 1972 shmem_falloc->waitq && 1973 vmf->pgoff >= shmem_falloc->start && 1974 vmf->pgoff < shmem_falloc->next) { 1975 wait_queue_head_t *shmem_falloc_waitq; 1976 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); 1977 1978 ret = VM_FAULT_NOPAGE; 1979 if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && 1980 !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { 1981 /* It's polite to up mmap_sem if we can */ 1982 up_read(&vma->vm_mm->mmap_sem); 1983 ret = VM_FAULT_RETRY; 1984 } 1985 1986 shmem_falloc_waitq = shmem_falloc->waitq; 1987 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, 1988 TASK_UNINTERRUPTIBLE); 1989 spin_unlock(&inode->i_lock); 1990 schedule(); 1991 1992 /* 1993 * shmem_falloc_waitq points into the shmem_fallocate() 1994 * stack of the hole-punching task: shmem_falloc_waitq 1995 * is usually invalid by the time we reach here, but 1996 * finish_wait() does not dereference it in that case; 1997 * though i_lock needed lest racing with wake_up_all(). 
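			 * (This is why synchronous_wake_function() above
			 * removes the entry unconditionally: once the
			 * hole-puncher's wake_up_all() has run, our entry is
			 * already off the list, so finish_wait() never touches
			 * the stale waitq.)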
1998 */ 1999 spin_lock(&inode->i_lock); 2000 finish_wait(shmem_falloc_waitq, &shmem_fault_wait); 2001 spin_unlock(&inode->i_lock); 2002 return ret; 2003 } 2004 spin_unlock(&inode->i_lock); 2005 } 2006 2007 sgp = SGP_CACHE; 2008 2009 if ((vma->vm_flags & VM_NOHUGEPAGE) || 2010 test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) 2011 sgp = SGP_NOHUGE; 2012 else if (vma->vm_flags & VM_HUGEPAGE) 2013 sgp = SGP_HUGE; 2014 2015 err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, 2016 gfp, vma, vmf, &ret); 2017 if (err) 2018 return vmf_error(err); 2019 return ret; 2020 } 2021 2022 unsigned long shmem_get_unmapped_area(struct file *file, 2023 unsigned long uaddr, unsigned long len, 2024 unsigned long pgoff, unsigned long flags) 2025 { 2026 unsigned long (*get_area)(struct file *, 2027 unsigned long, unsigned long, unsigned long, unsigned long); 2028 unsigned long addr; 2029 unsigned long offset; 2030 unsigned long inflated_len; 2031 unsigned long inflated_addr; 2032 unsigned long inflated_offset; 2033 2034 if (len > TASK_SIZE) 2035 return -ENOMEM; 2036 2037 get_area = current->mm->get_unmapped_area; 2038 addr = get_area(file, uaddr, len, pgoff, flags); 2039 2040 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) 2041 return addr; 2042 if (IS_ERR_VALUE(addr)) 2043 return addr; 2044 if (addr & ~PAGE_MASK) 2045 return addr; 2046 if (addr > TASK_SIZE - len) 2047 return addr; 2048 2049 if (shmem_huge == SHMEM_HUGE_DENY) 2050 return addr; 2051 if (len < HPAGE_PMD_SIZE) 2052 return addr; 2053 if (flags & MAP_FIXED) 2054 return addr; 2055 /* 2056 * Our priority is to support MAP_SHARED mapped hugely; 2057 * and support MAP_PRIVATE mapped hugely too, until it is COWed. 2058 * But if caller specified an address hint, respect that as before. 2059 */ 2060 if (uaddr) 2061 return addr; 2062 2063 if (shmem_huge != SHMEM_HUGE_FORCE) { 2064 struct super_block *sb; 2065 2066 if (file) { 2067 VM_BUG_ON(file->f_op != &shmem_file_operations); 2068 sb = file_inode(file)->i_sb; 2069 } else { 2070 /* 2071 * Called directly from mm/mmap.c, or drivers/char/mem.c 2072 * for "/dev/zero", to create a shared anonymous object. 
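 *
 * Hedged userspace illustration (not taken from this file; sizes arbitrary):
 * a large MAP_SHARED | MAP_ANONYMOUS mapping reaches this helper via the
 * internal shm_mnt, so when huge pages are enabled the inflation logic below
 * may hand back a PMD-aligned address for it:
 *
 *	#include <sys/mman.h>
 *
 *	void *p = mmap(NULL, 4UL << 20, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);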
2073 */ 2074 if (IS_ERR(shm_mnt)) 2075 return addr; 2076 sb = shm_mnt->mnt_sb; 2077 } 2078 if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) 2079 return addr; 2080 } 2081 2082 offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); 2083 if (offset && offset + len < 2 * HPAGE_PMD_SIZE) 2084 return addr; 2085 if ((addr & (HPAGE_PMD_SIZE-1)) == offset) 2086 return addr; 2087 2088 inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; 2089 if (inflated_len > TASK_SIZE) 2090 return addr; 2091 if (inflated_len < len) 2092 return addr; 2093 2094 inflated_addr = get_area(NULL, 0, inflated_len, 0, flags); 2095 if (IS_ERR_VALUE(inflated_addr)) 2096 return addr; 2097 if (inflated_addr & ~PAGE_MASK) 2098 return addr; 2099 2100 inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); 2101 inflated_addr += offset - inflated_offset; 2102 if (inflated_offset > offset) 2103 inflated_addr += HPAGE_PMD_SIZE; 2104 2105 if (inflated_addr > TASK_SIZE - len) 2106 return addr; 2107 return inflated_addr; 2108 } 2109 2110 #ifdef CONFIG_NUMA 2111 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 2112 { 2113 struct inode *inode = file_inode(vma->vm_file); 2114 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 2115 } 2116 2117 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 2118 unsigned long addr) 2119 { 2120 struct inode *inode = file_inode(vma->vm_file); 2121 pgoff_t index; 2122 2123 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2124 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 2125 } 2126 #endif 2127 2128 int shmem_lock(struct file *file, int lock, struct user_struct *user) 2129 { 2130 struct inode *inode = file_inode(file); 2131 struct shmem_inode_info *info = SHMEM_I(inode); 2132 int retval = -ENOMEM; 2133 2134 spin_lock_irq(&info->lock); 2135 if (lock && !(info->flags & VM_LOCKED)) { 2136 if (!user_shm_lock(inode->i_size, user)) 2137 goto out_nomem; 2138 info->flags |= VM_LOCKED; 2139 mapping_set_unevictable(file->f_mapping); 2140 } 2141 if (!lock && (info->flags & VM_LOCKED) && user) { 2142 user_shm_unlock(inode->i_size, user); 2143 info->flags &= ~VM_LOCKED; 2144 mapping_clear_unevictable(file->f_mapping); 2145 } 2146 retval = 0; 2147 2148 out_nomem: 2149 spin_unlock_irq(&info->lock); 2150 return retval; 2151 } 2152 2153 static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 2154 { 2155 file_accessed(file); 2156 vma->vm_ops = &shmem_vm_ops; 2157 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && 2158 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < 2159 (vma->vm_end & HPAGE_PMD_MASK)) { 2160 khugepaged_enter(vma, vma->vm_flags); 2161 } 2162 return 0; 2163 } 2164 2165 static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, 2166 umode_t mode, dev_t dev, unsigned long flags) 2167 { 2168 struct inode *inode; 2169 struct shmem_inode_info *info; 2170 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2171 2172 if (shmem_reserve_inode(sb)) 2173 return NULL; 2174 2175 inode = new_inode(sb); 2176 if (inode) { 2177 inode->i_ino = get_next_ino(); 2178 inode_init_owner(inode, dir, mode); 2179 inode->i_blocks = 0; 2180 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); 2181 inode->i_generation = prandom_u32(); 2182 info = SHMEM_I(inode); 2183 memset(info, 0, (char *)inode - (char *)info); 2184 spin_lock_init(&info->lock); 2185 info->seals = F_SEAL_SEAL; 2186 info->flags = flags & VM_NORESERVE; 2187 INIT_LIST_HEAD(&info->shrinklist); 2188 
INIT_LIST_HEAD(&info->swaplist); 2189 simple_xattrs_init(&info->xattrs); 2190 cache_no_acl(inode); 2191 2192 switch (mode & S_IFMT) { 2193 default: 2194 inode->i_op = &shmem_special_inode_operations; 2195 init_special_inode(inode, mode, dev); 2196 break; 2197 case S_IFREG: 2198 inode->i_mapping->a_ops = &shmem_aops; 2199 inode->i_op = &shmem_inode_operations; 2200 inode->i_fop = &shmem_file_operations; 2201 mpol_shared_policy_init(&info->policy, 2202 shmem_get_sbmpol(sbinfo)); 2203 break; 2204 case S_IFDIR: 2205 inc_nlink(inode); 2206 /* Some things misbehave if size == 0 on a directory */ 2207 inode->i_size = 2 * BOGO_DIRENT_SIZE; 2208 inode->i_op = &shmem_dir_inode_operations; 2209 inode->i_fop = &simple_dir_operations; 2210 break; 2211 case S_IFLNK: 2212 /* 2213 * Must not load anything in the rbtree, 2214 * mpol_free_shared_policy will not be called. 2215 */ 2216 mpol_shared_policy_init(&info->policy, NULL); 2217 break; 2218 } 2219 2220 lockdep_annotate_inode_mutex_key(inode); 2221 } else 2222 shmem_free_inode(sb); 2223 return inode; 2224 } 2225 2226 bool shmem_mapping(struct address_space *mapping) 2227 { 2228 return mapping->a_ops == &shmem_aops; 2229 } 2230 2231 static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, 2232 pmd_t *dst_pmd, 2233 struct vm_area_struct *dst_vma, 2234 unsigned long dst_addr, 2235 unsigned long src_addr, 2236 bool zeropage, 2237 struct page **pagep) 2238 { 2239 struct inode *inode = file_inode(dst_vma->vm_file); 2240 struct shmem_inode_info *info = SHMEM_I(inode); 2241 struct address_space *mapping = inode->i_mapping; 2242 gfp_t gfp = mapping_gfp_mask(mapping); 2243 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 2244 struct mem_cgroup *memcg; 2245 spinlock_t *ptl; 2246 void *page_kaddr; 2247 struct page *page; 2248 pte_t _dst_pte, *dst_pte; 2249 int ret; 2250 2251 ret = -ENOMEM; 2252 if (!shmem_inode_acct_block(inode, 1)) 2253 goto out; 2254 2255 if (!*pagep) { 2256 page = shmem_alloc_page(gfp, info, pgoff); 2257 if (!page) 2258 goto out_unacct_blocks; 2259 2260 if (!zeropage) { /* mcopy_atomic */ 2261 page_kaddr = kmap_atomic(page); 2262 ret = copy_from_user(page_kaddr, 2263 (const void __user *)src_addr, 2264 PAGE_SIZE); 2265 kunmap_atomic(page_kaddr); 2266 2267 /* fallback to copy_from_user outside mmap_sem */ 2268 if (unlikely(ret)) { 2269 *pagep = page; 2270 shmem_inode_unacct_blocks(inode, 1); 2271 /* don't free the page */ 2272 return -EFAULT; 2273 } 2274 } else { /* mfill_zeropage_atomic */ 2275 clear_highpage(page); 2276 } 2277 } else { 2278 page = *pagep; 2279 *pagep = NULL; 2280 } 2281 2282 VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); 2283 __SetPageLocked(page); 2284 __SetPageSwapBacked(page); 2285 __SetPageUptodate(page); 2286 2287 ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); 2288 if (ret) 2289 goto out_release; 2290 2291 ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); 2292 if (!ret) { 2293 ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); 2294 radix_tree_preload_end(); 2295 } 2296 if (ret) 2297 goto out_release_uncharge; 2298 2299 mem_cgroup_commit_charge(page, memcg, false, false); 2300 2301 _dst_pte = mk_pte(page, dst_vma->vm_page_prot); 2302 if (dst_vma->vm_flags & VM_WRITE) 2303 _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); 2304 2305 ret = -EEXIST; 2306 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 2307 if (!pte_none(*dst_pte)) 2308 goto out_release_uncharge_unlock; 2309 2310 lru_cache_add_anon(page); 2311 2312 spin_lock(&info->lock); 2313 info->alloced++; 2314 
inode->i_blocks += BLOCKS_PER_PAGE; 2315 shmem_recalc_inode(inode); 2316 spin_unlock(&info->lock); 2317 2318 inc_mm_counter(dst_mm, mm_counter_file(page)); 2319 page_add_file_rmap(page, false); 2320 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 2321 2322 /* No need to invalidate - it was non-present before */ 2323 update_mmu_cache(dst_vma, dst_addr, dst_pte); 2324 unlock_page(page); 2325 pte_unmap_unlock(dst_pte, ptl); 2326 ret = 0; 2327 out: 2328 return ret; 2329 out_release_uncharge_unlock: 2330 pte_unmap_unlock(dst_pte, ptl); 2331 out_release_uncharge: 2332 mem_cgroup_cancel_charge(page, memcg, false); 2333 out_release: 2334 unlock_page(page); 2335 put_page(page); 2336 out_unacct_blocks: 2337 shmem_inode_unacct_blocks(inode, 1); 2338 goto out; 2339 } 2340 2341 int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, 2342 pmd_t *dst_pmd, 2343 struct vm_area_struct *dst_vma, 2344 unsigned long dst_addr, 2345 unsigned long src_addr, 2346 struct page **pagep) 2347 { 2348 return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, 2349 dst_addr, src_addr, false, pagep); 2350 } 2351 2352 int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, 2353 pmd_t *dst_pmd, 2354 struct vm_area_struct *dst_vma, 2355 unsigned long dst_addr) 2356 { 2357 struct page *page = NULL; 2358 2359 return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, 2360 dst_addr, 0, true, &page); 2361 } 2362 2363 #ifdef CONFIG_TMPFS 2364 static const struct inode_operations shmem_symlink_inode_operations; 2365 static const struct inode_operations shmem_short_symlink_operations; 2366 2367 #ifdef CONFIG_TMPFS_XATTR 2368 static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 2369 #else 2370 #define shmem_initxattrs NULL 2371 #endif 2372 2373 static int 2374 shmem_write_begin(struct file *file, struct address_space *mapping, 2375 loff_t pos, unsigned len, unsigned flags, 2376 struct page **pagep, void **fsdata) 2377 { 2378 struct inode *inode = mapping->host; 2379 struct shmem_inode_info *info = SHMEM_I(inode); 2380 pgoff_t index = pos >> PAGE_SHIFT; 2381 2382 /* i_mutex is held by caller */ 2383 if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { 2384 if (info->seals & F_SEAL_WRITE) 2385 return -EPERM; 2386 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 2387 return -EPERM; 2388 } 2389 2390 return shmem_getpage(inode, index, pagep, SGP_WRITE); 2391 } 2392 2393 static int 2394 shmem_write_end(struct file *file, struct address_space *mapping, 2395 loff_t pos, unsigned len, unsigned copied, 2396 struct page *page, void *fsdata) 2397 { 2398 struct inode *inode = mapping->host; 2399 2400 if (pos + copied > inode->i_size) 2401 i_size_write(inode, pos + copied); 2402 2403 if (!PageUptodate(page)) { 2404 struct page *head = compound_head(page); 2405 if (PageTransCompound(page)) { 2406 int i; 2407 2408 for (i = 0; i < HPAGE_PMD_NR; i++) { 2409 if (head + i == page) 2410 continue; 2411 clear_highpage(head + i); 2412 flush_dcache_page(head + i); 2413 } 2414 } 2415 if (copied < PAGE_SIZE) { 2416 unsigned from = pos & (PAGE_SIZE - 1); 2417 zero_user_segments(page, 0, from, 2418 from + copied, PAGE_SIZE); 2419 } 2420 SetPageUptodate(head); 2421 } 2422 set_page_dirty(page); 2423 unlock_page(page); 2424 put_page(page); 2425 2426 return copied; 2427 } 2428 2429 static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 2430 { 2431 struct file *file = iocb->ki_filp; 2432 struct inode *inode = file_inode(file); 2433 struct address_space *mapping = inode->i_mapping; 2434 pgoff_t index; 2435 
unsigned long offset; 2436 enum sgp_type sgp = SGP_READ; 2437 int error = 0; 2438 ssize_t retval = 0; 2439 loff_t *ppos = &iocb->ki_pos; 2440 2441 /* 2442 * Might this read be for a stacking filesystem? Then when reading 2443 * holes of a sparse file, we actually need to allocate those pages, 2444 * and even mark them dirty, so it cannot exceed the max_blocks limit. 2445 */ 2446 if (!iter_is_iovec(to)) 2447 sgp = SGP_CACHE; 2448 2449 index = *ppos >> PAGE_SHIFT; 2450 offset = *ppos & ~PAGE_MASK; 2451 2452 for (;;) { 2453 struct page *page = NULL; 2454 pgoff_t end_index; 2455 unsigned long nr, ret; 2456 loff_t i_size = i_size_read(inode); 2457 2458 end_index = i_size >> PAGE_SHIFT; 2459 if (index > end_index) 2460 break; 2461 if (index == end_index) { 2462 nr = i_size & ~PAGE_MASK; 2463 if (nr <= offset) 2464 break; 2465 } 2466 2467 error = shmem_getpage(inode, index, &page, sgp); 2468 if (error) { 2469 if (error == -EINVAL) 2470 error = 0; 2471 break; 2472 } 2473 if (page) { 2474 if (sgp == SGP_CACHE) 2475 set_page_dirty(page); 2476 unlock_page(page); 2477 } 2478 2479 /* 2480 * We must evaluate after, since reads (unlike writes) 2481 * are called without i_mutex protection against truncate 2482 */ 2483 nr = PAGE_SIZE; 2484 i_size = i_size_read(inode); 2485 end_index = i_size >> PAGE_SHIFT; 2486 if (index == end_index) { 2487 nr = i_size & ~PAGE_MASK; 2488 if (nr <= offset) { 2489 if (page) 2490 put_page(page); 2491 break; 2492 } 2493 } 2494 nr -= offset; 2495 2496 if (page) { 2497 /* 2498 * If users can be writing to this page using arbitrary 2499 * virtual addresses, take care about potential aliasing 2500 * before reading the page on the kernel side. 2501 */ 2502 if (mapping_writably_mapped(mapping)) 2503 flush_dcache_page(page); 2504 /* 2505 * Mark the page accessed if we read the beginning. 2506 */ 2507 if (!offset) 2508 mark_page_accessed(page); 2509 } else { 2510 page = ZERO_PAGE(0); 2511 get_page(page); 2512 } 2513 2514 /* 2515 * Ok, we have the page, and it's up-to-date, so 2516 * now we can copy it to user space... 2517 */ 2518 ret = copy_page_to_iter(page, offset, nr, to); 2519 retval += ret; 2520 offset += ret; 2521 index += offset >> PAGE_SHIFT; 2522 offset &= ~PAGE_MASK; 2523 2524 put_page(page); 2525 if (!iov_iter_count(to)) 2526 break; 2527 if (ret < nr) { 2528 error = -EFAULT; 2529 break; 2530 } 2531 cond_resched(); 2532 } 2533 2534 *ppos = ((loff_t) index << PAGE_SHIFT) + offset; 2535 file_accessed(file); 2536 return retval ? retval : error; 2537 } 2538 2539 /* 2540 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 
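 *
 * Hedged userspace sketch of the interface this backs (SEEK_DATA and
 * SEEK_HOLE are the standard lseek(2) whence values; the path is made up).
 * The first call finds the next offset backed by data, the second the end
 * of that data extent:
 *
 *	#define _GNU_SOURCE
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/dev/shm/sparse", O_RDONLY);
 *	off_t data = lseek(fd, 0, SEEK_DATA);
 *	off_t hole = lseek(fd, data, SEEK_HOLE);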
2541 */ 2542 static pgoff_t shmem_seek_hole_data(struct address_space *mapping, 2543 pgoff_t index, pgoff_t end, int whence) 2544 { 2545 struct page *page; 2546 struct pagevec pvec; 2547 pgoff_t indices[PAGEVEC_SIZE]; 2548 bool done = false; 2549 int i; 2550 2551 pagevec_init(&pvec); 2552 pvec.nr = 1; /* start small: we may be there already */ 2553 while (!done) { 2554 pvec.nr = find_get_entries(mapping, index, 2555 pvec.nr, pvec.pages, indices); 2556 if (!pvec.nr) { 2557 if (whence == SEEK_DATA) 2558 index = end; 2559 break; 2560 } 2561 for (i = 0; i < pvec.nr; i++, index++) { 2562 if (index < indices[i]) { 2563 if (whence == SEEK_HOLE) { 2564 done = true; 2565 break; 2566 } 2567 index = indices[i]; 2568 } 2569 page = pvec.pages[i]; 2570 if (page && !xa_is_value(page)) { 2571 if (!PageUptodate(page)) 2572 page = NULL; 2573 } 2574 if (index >= end || 2575 (page && whence == SEEK_DATA) || 2576 (!page && whence == SEEK_HOLE)) { 2577 done = true; 2578 break; 2579 } 2580 } 2581 pagevec_remove_exceptionals(&pvec); 2582 pagevec_release(&pvec); 2583 pvec.nr = PAGEVEC_SIZE; 2584 cond_resched(); 2585 } 2586 return index; 2587 } 2588 2589 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) 2590 { 2591 struct address_space *mapping = file->f_mapping; 2592 struct inode *inode = mapping->host; 2593 pgoff_t start, end; 2594 loff_t new_offset; 2595 2596 if (whence != SEEK_DATA && whence != SEEK_HOLE) 2597 return generic_file_llseek_size(file, offset, whence, 2598 MAX_LFS_FILESIZE, i_size_read(inode)); 2599 inode_lock(inode); 2600 /* We're holding i_mutex so we can access i_size directly */ 2601 2602 if (offset < 0) 2603 offset = -EINVAL; 2604 else if (offset >= inode->i_size) 2605 offset = -ENXIO; 2606 else { 2607 start = offset >> PAGE_SHIFT; 2608 end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2609 new_offset = shmem_seek_hole_data(mapping, start, end, whence); 2610 new_offset <<= PAGE_SHIFT; 2611 if (new_offset > offset) { 2612 if (new_offset < inode->i_size) 2613 offset = new_offset; 2614 else if (whence == SEEK_DATA) 2615 offset = -ENXIO; 2616 else 2617 offset = inode->i_size; 2618 } 2619 } 2620 2621 if (offset >= 0) 2622 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); 2623 inode_unlock(inode); 2624 return offset; 2625 } 2626 2627 static long shmem_fallocate(struct file *file, int mode, loff_t offset, 2628 loff_t len) 2629 { 2630 struct inode *inode = file_inode(file); 2631 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 2632 struct shmem_inode_info *info = SHMEM_I(inode); 2633 struct shmem_falloc shmem_falloc; 2634 pgoff_t start, index, end; 2635 int error; 2636 2637 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2638 return -EOPNOTSUPP; 2639 2640 inode_lock(inode); 2641 2642 if (mode & FALLOC_FL_PUNCH_HOLE) { 2643 struct address_space *mapping = file->f_mapping; 2644 loff_t unmap_start = round_up(offset, PAGE_SIZE); 2645 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 2646 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 2647 2648 /* protected by i_mutex */ 2649 if (info->seals & F_SEAL_WRITE) { 2650 error = -EPERM; 2651 goto out; 2652 } 2653 2654 shmem_falloc.waitq = &shmem_falloc_waitq; 2655 shmem_falloc.start = unmap_start >> PAGE_SHIFT; 2656 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; 2657 spin_lock(&inode->i_lock); 2658 inode->i_private = &shmem_falloc; 2659 spin_unlock(&inode->i_lock); 2660 2661 if ((u64)unmap_end > (u64)unmap_start) 2662 unmap_mapping_range(mapping, unmap_start, 2663 1 + unmap_end - unmap_start, 0); 
2664 shmem_truncate_range(inode, offset, offset + len - 1); 2665 /* No need to unmap again: hole-punching leaves COWed pages */ 2666 2667 spin_lock(&inode->i_lock); 2668 inode->i_private = NULL; 2669 wake_up_all(&shmem_falloc_waitq); 2670 WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); 2671 spin_unlock(&inode->i_lock); 2672 error = 0; 2673 goto out; 2674 } 2675 2676 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 2677 error = inode_newsize_ok(inode, offset + len); 2678 if (error) 2679 goto out; 2680 2681 if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { 2682 error = -EPERM; 2683 goto out; 2684 } 2685 2686 start = offset >> PAGE_SHIFT; 2687 end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 2688 /* Try to avoid a swapstorm if len is impossible to satisfy */ 2689 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 2690 error = -ENOSPC; 2691 goto out; 2692 } 2693 2694 shmem_falloc.waitq = NULL; 2695 shmem_falloc.start = start; 2696 shmem_falloc.next = start; 2697 shmem_falloc.nr_falloced = 0; 2698 shmem_falloc.nr_unswapped = 0; 2699 spin_lock(&inode->i_lock); 2700 inode->i_private = &shmem_falloc; 2701 spin_unlock(&inode->i_lock); 2702 2703 for (index = start; index < end; index++) { 2704 struct page *page; 2705 2706 /* 2707 * Good, the fallocate(2) manpage permits EINTR: we may have 2708 * been interrupted because we are using up too much memory. 2709 */ 2710 if (signal_pending(current)) 2711 error = -EINTR; 2712 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 2713 error = -ENOMEM; 2714 else 2715 error = shmem_getpage(inode, index, &page, SGP_FALLOC); 2716 if (error) { 2717 /* Remove the !PageUptodate pages we added */ 2718 if (index > start) { 2719 shmem_undo_range(inode, 2720 (loff_t)start << PAGE_SHIFT, 2721 ((loff_t)index << PAGE_SHIFT) - 1, true); 2722 } 2723 goto undone; 2724 } 2725 2726 /* 2727 * Inform shmem_writepage() how far we have reached. 2728 * No need for lock or barrier: we have the page lock. 2729 */ 2730 shmem_falloc.next++; 2731 if (!PageUptodate(page)) 2732 shmem_falloc.nr_falloced++; 2733 2734 /* 2735 * If !PageUptodate, leave it that way so that freeable pages 2736 * can be recognized if we need to rollback on error later. 2737 * But set_page_dirty so that memory pressure will swap rather 2738 * than free the pages we are allocating (and SGP_CACHE pages 2739 * might still be clean: we now need to mark those dirty too). 2740 */ 2741 set_page_dirty(page); 2742 unlock_page(page); 2743 put_page(page); 2744 cond_resched(); 2745 } 2746 2747 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 2748 i_size_write(inode, offset + len); 2749 inode->i_ctime = current_time(inode); 2750 undone: 2751 spin_lock(&inode->i_lock); 2752 inode->i_private = NULL; 2753 spin_unlock(&inode->i_lock); 2754 out: 2755 inode_unlock(inode); 2756 return error; 2757 } 2758 2759 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 2760 { 2761 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 2762 2763 buf->f_type = TMPFS_MAGIC; 2764 buf->f_bsize = PAGE_SIZE; 2765 buf->f_namelen = NAME_MAX; 2766 if (sbinfo->max_blocks) { 2767 buf->f_blocks = sbinfo->max_blocks; 2768 buf->f_bavail = 2769 buf->f_bfree = sbinfo->max_blocks - 2770 percpu_counter_sum(&sbinfo->used_blocks); 2771 } 2772 if (sbinfo->max_inodes) { 2773 buf->f_files = sbinfo->max_inodes; 2774 buf->f_ffree = sbinfo->free_inodes; 2775 } 2776 /* else leave those fields 0 like simple_statfs */ 2777 return 0; 2778 } 2779 2780 /* 2781 * File creation. 
Allocate an inode, and we're done.. 2782 */ 2783 static int 2784 shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) 2785 { 2786 struct inode *inode; 2787 int error = -ENOSPC; 2788 2789 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 2790 if (inode) { 2791 error = simple_acl_create(dir, inode); 2792 if (error) 2793 goto out_iput; 2794 error = security_inode_init_security(inode, dir, 2795 &dentry->d_name, 2796 shmem_initxattrs, NULL); 2797 if (error && error != -EOPNOTSUPP) 2798 goto out_iput; 2799 2800 error = 0; 2801 dir->i_size += BOGO_DIRENT_SIZE; 2802 dir->i_ctime = dir->i_mtime = current_time(dir); 2803 d_instantiate(dentry, inode); 2804 dget(dentry); /* Extra count - pin the dentry in core */ 2805 } 2806 return error; 2807 out_iput: 2808 iput(inode); 2809 return error; 2810 } 2811 2812 static int 2813 shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 2814 { 2815 struct inode *inode; 2816 int error = -ENOSPC; 2817 2818 inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); 2819 if (inode) { 2820 error = security_inode_init_security(inode, dir, 2821 NULL, 2822 shmem_initxattrs, NULL); 2823 if (error && error != -EOPNOTSUPP) 2824 goto out_iput; 2825 error = simple_acl_create(dir, inode); 2826 if (error) 2827 goto out_iput; 2828 d_tmpfile(dentry, inode); 2829 } 2830 return error; 2831 out_iput: 2832 iput(inode); 2833 return error; 2834 } 2835 2836 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2837 { 2838 int error; 2839 2840 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) 2841 return error; 2842 inc_nlink(dir); 2843 return 0; 2844 } 2845 2846 static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 2847 bool excl) 2848 { 2849 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 2850 } 2851 2852 /* 2853 * Link a file.. 2854 */ 2855 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 2856 { 2857 struct inode *inode = d_inode(old_dentry); 2858 int ret; 2859 2860 /* 2861 * No ordinary (disk based) filesystem counts links as inodes; 2862 * but each new link needs a new dentry, pinning lowmem, and 2863 * tmpfs dentries cannot be pruned until they are unlinked. 
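 *
 * Hedged userspace illustration (paths invented): because of this
 * reservation, adding a hard link can fail even though no new inode is
 * allocated:
 *
 *	#include <unistd.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *
 *	if (link("/dev/shm/a", "/dev/shm/b") == -1 && errno == ENOSPC)
 *		puts("nr_inodes pool of the tmpfs mount is exhausted");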
2864 */ 2865 ret = shmem_reserve_inode(inode->i_sb); 2866 if (ret) 2867 goto out; 2868 2869 dir->i_size += BOGO_DIRENT_SIZE; 2870 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 2871 inc_nlink(inode); 2872 ihold(inode); /* New dentry reference */ 2873 dget(dentry); /* Extra pinning count for the created dentry */ 2874 d_instantiate(dentry, inode); 2875 out: 2876 return ret; 2877 } 2878 2879 static int shmem_unlink(struct inode *dir, struct dentry *dentry) 2880 { 2881 struct inode *inode = d_inode(dentry); 2882 2883 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 2884 shmem_free_inode(inode->i_sb); 2885 2886 dir->i_size -= BOGO_DIRENT_SIZE; 2887 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 2888 drop_nlink(inode); 2889 dput(dentry); /* Undo the count from "create" - this does all the work */ 2890 return 0; 2891 } 2892 2893 static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 2894 { 2895 if (!simple_empty(dentry)) 2896 return -ENOTEMPTY; 2897 2898 drop_nlink(d_inode(dentry)); 2899 drop_nlink(dir); 2900 return shmem_unlink(dir, dentry); 2901 } 2902 2903 static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) 2904 { 2905 bool old_is_dir = d_is_dir(old_dentry); 2906 bool new_is_dir = d_is_dir(new_dentry); 2907 2908 if (old_dir != new_dir && old_is_dir != new_is_dir) { 2909 if (old_is_dir) { 2910 drop_nlink(old_dir); 2911 inc_nlink(new_dir); 2912 } else { 2913 drop_nlink(new_dir); 2914 inc_nlink(old_dir); 2915 } 2916 } 2917 old_dir->i_ctime = old_dir->i_mtime = 2918 new_dir->i_ctime = new_dir->i_mtime = 2919 d_inode(old_dentry)->i_ctime = 2920 d_inode(new_dentry)->i_ctime = current_time(old_dir); 2921 2922 return 0; 2923 } 2924 2925 static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) 2926 { 2927 struct dentry *whiteout; 2928 int error; 2929 2930 whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); 2931 if (!whiteout) 2932 return -ENOMEM; 2933 2934 error = shmem_mknod(old_dir, whiteout, 2935 S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); 2936 dput(whiteout); 2937 if (error) 2938 return error; 2939 2940 /* 2941 * Cheat and hash the whiteout while the old dentry is still in 2942 * place, instead of playing games with FS_RENAME_DOES_D_MOVE. 2943 * 2944 * d_lookup() will consistently find one of them at this point, 2945 * not sure which one, but that isn't even important. 2946 */ 2947 d_rehash(whiteout); 2948 return 0; 2949 } 2950 2951 /* 2952 * The VFS layer already does all the dentry stuff for rename, 2953 * we just have to decrement the usage count for the target if 2954 * it exists so that the VFS layer correctly free's it when it 2955 * gets overwritten. 
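 *
 * Hedged userspace sketch (names invented; renameat2() and these flags are
 * the standard interface that shmem_rename2() below implements, the glibc
 * wrapper appearing in glibc >= 2.28):
 *
 *	#define _GNU_SOURCE
 *	#include <stdio.h>
 *	#include <fcntl.h>
 *
 *	renameat2(AT_FDCWD, "old", AT_FDCWD, "new", RENAME_NOREPLACE);
 *	renameat2(AT_FDCWD, "a", AT_FDCWD, "b", RENAME_EXCHANGE);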
2956 */ 2957 static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) 2958 { 2959 struct inode *inode = d_inode(old_dentry); 2960 int they_are_dirs = S_ISDIR(inode->i_mode); 2961 2962 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2963 return -EINVAL; 2964 2965 if (flags & RENAME_EXCHANGE) 2966 return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry); 2967 2968 if (!simple_empty(new_dentry)) 2969 return -ENOTEMPTY; 2970 2971 if (flags & RENAME_WHITEOUT) { 2972 int error; 2973 2974 error = shmem_whiteout(old_dir, old_dentry); 2975 if (error) 2976 return error; 2977 } 2978 2979 if (d_really_is_positive(new_dentry)) { 2980 (void) shmem_unlink(new_dir, new_dentry); 2981 if (they_are_dirs) { 2982 drop_nlink(d_inode(new_dentry)); 2983 drop_nlink(old_dir); 2984 } 2985 } else if (they_are_dirs) { 2986 drop_nlink(old_dir); 2987 inc_nlink(new_dir); 2988 } 2989 2990 old_dir->i_size -= BOGO_DIRENT_SIZE; 2991 new_dir->i_size += BOGO_DIRENT_SIZE; 2992 old_dir->i_ctime = old_dir->i_mtime = 2993 new_dir->i_ctime = new_dir->i_mtime = 2994 inode->i_ctime = current_time(old_dir); 2995 return 0; 2996 } 2997 2998 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 2999 { 3000 int error; 3001 int len; 3002 struct inode *inode; 3003 struct page *page; 3004 3005 len = strlen(symname) + 1; 3006 if (len > PAGE_SIZE) 3007 return -ENAMETOOLONG; 3008 3009 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0, 3010 VM_NORESERVE); 3011 if (!inode) 3012 return -ENOSPC; 3013 3014 error = security_inode_init_security(inode, dir, &dentry->d_name, 3015 shmem_initxattrs, NULL); 3016 if (error) { 3017 if (error != -EOPNOTSUPP) { 3018 iput(inode); 3019 return error; 3020 } 3021 error = 0; 3022 } 3023 3024 inode->i_size = len-1; 3025 if (len <= SHORT_SYMLINK_LEN) { 3026 inode->i_link = kmemdup(symname, len, GFP_KERNEL); 3027 if (!inode->i_link) { 3028 iput(inode); 3029 return -ENOMEM; 3030 } 3031 inode->i_op = &shmem_short_symlink_operations; 3032 } else { 3033 inode_nohighmem(inode); 3034 error = shmem_getpage(inode, 0, &page, SGP_WRITE); 3035 if (error) { 3036 iput(inode); 3037 return error; 3038 } 3039 inode->i_mapping->a_ops = &shmem_aops; 3040 inode->i_op = &shmem_symlink_inode_operations; 3041 memcpy(page_address(page), symname, len); 3042 SetPageUptodate(page); 3043 set_page_dirty(page); 3044 unlock_page(page); 3045 put_page(page); 3046 } 3047 dir->i_size += BOGO_DIRENT_SIZE; 3048 dir->i_ctime = dir->i_mtime = current_time(dir); 3049 d_instantiate(dentry, inode); 3050 dget(dentry); 3051 return 0; 3052 } 3053 3054 static void shmem_put_link(void *arg) 3055 { 3056 mark_page_accessed(arg); 3057 put_page(arg); 3058 } 3059 3060 static const char *shmem_get_link(struct dentry *dentry, 3061 struct inode *inode, 3062 struct delayed_call *done) 3063 { 3064 struct page *page = NULL; 3065 int error; 3066 if (!dentry) { 3067 page = find_get_page(inode->i_mapping, 0); 3068 if (!page) 3069 return ERR_PTR(-ECHILD); 3070 if (!PageUptodate(page)) { 3071 put_page(page); 3072 return ERR_PTR(-ECHILD); 3073 } 3074 } else { 3075 error = shmem_getpage(inode, 0, &page, SGP_READ); 3076 if (error) 3077 return ERR_PTR(error); 3078 unlock_page(page); 3079 } 3080 set_delayed_call(done, shmem_put_link, page); 3081 return page_address(page); 3082 } 3083 3084 #ifdef CONFIG_TMPFS_XATTR 3085 /* 3086 * Superblocks without xattr inode operations may get some security.* xattr 3087 * support from the LSM 
"for free". As soon as we have any other xattrs 3088 * like ACLs, we also need to implement the security.* handlers at 3089 * filesystem level, though. 3090 */ 3091 3092 /* 3093 * Callback for security_inode_init_security() for acquiring xattrs. 3094 */ 3095 static int shmem_initxattrs(struct inode *inode, 3096 const struct xattr *xattr_array, 3097 void *fs_info) 3098 { 3099 struct shmem_inode_info *info = SHMEM_I(inode); 3100 const struct xattr *xattr; 3101 struct simple_xattr *new_xattr; 3102 size_t len; 3103 3104 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 3105 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 3106 if (!new_xattr) 3107 return -ENOMEM; 3108 3109 len = strlen(xattr->name) + 1; 3110 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 3111 GFP_KERNEL); 3112 if (!new_xattr->name) { 3113 kfree(new_xattr); 3114 return -ENOMEM; 3115 } 3116 3117 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 3118 XATTR_SECURITY_PREFIX_LEN); 3119 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 3120 xattr->name, len); 3121 3122 simple_xattr_list_add(&info->xattrs, new_xattr); 3123 } 3124 3125 return 0; 3126 } 3127 3128 static int shmem_xattr_handler_get(const struct xattr_handler *handler, 3129 struct dentry *unused, struct inode *inode, 3130 const char *name, void *buffer, size_t size) 3131 { 3132 struct shmem_inode_info *info = SHMEM_I(inode); 3133 3134 name = xattr_full_name(handler, name); 3135 return simple_xattr_get(&info->xattrs, name, buffer, size); 3136 } 3137 3138 static int shmem_xattr_handler_set(const struct xattr_handler *handler, 3139 struct dentry *unused, struct inode *inode, 3140 const char *name, const void *value, 3141 size_t size, int flags) 3142 { 3143 struct shmem_inode_info *info = SHMEM_I(inode); 3144 3145 name = xattr_full_name(handler, name); 3146 return simple_xattr_set(&info->xattrs, name, value, size, flags); 3147 } 3148 3149 static const struct xattr_handler shmem_security_xattr_handler = { 3150 .prefix = XATTR_SECURITY_PREFIX, 3151 .get = shmem_xattr_handler_get, 3152 .set = shmem_xattr_handler_set, 3153 }; 3154 3155 static const struct xattr_handler shmem_trusted_xattr_handler = { 3156 .prefix = XATTR_TRUSTED_PREFIX, 3157 .get = shmem_xattr_handler_get, 3158 .set = shmem_xattr_handler_set, 3159 }; 3160 3161 static const struct xattr_handler *shmem_xattr_handlers[] = { 3162 #ifdef CONFIG_TMPFS_POSIX_ACL 3163 &posix_acl_access_xattr_handler, 3164 &posix_acl_default_xattr_handler, 3165 #endif 3166 &shmem_security_xattr_handler, 3167 &shmem_trusted_xattr_handler, 3168 NULL 3169 }; 3170 3171 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 3172 { 3173 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3174 return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); 3175 } 3176 #endif /* CONFIG_TMPFS_XATTR */ 3177 3178 static const struct inode_operations shmem_short_symlink_operations = { 3179 .get_link = simple_get_link, 3180 #ifdef CONFIG_TMPFS_XATTR 3181 .listxattr = shmem_listxattr, 3182 #endif 3183 }; 3184 3185 static const struct inode_operations shmem_symlink_inode_operations = { 3186 .get_link = shmem_get_link, 3187 #ifdef CONFIG_TMPFS_XATTR 3188 .listxattr = shmem_listxattr, 3189 #endif 3190 }; 3191 3192 static struct dentry *shmem_get_parent(struct dentry *child) 3193 { 3194 return ERR_PTR(-ESTALE); 3195 } 3196 3197 static int shmem_match(struct inode *ino, void *vfh) 3198 { 3199 __u32 *fh = vfh; 3200 __u64 inum = fh[2]; 3201 inum = (inum << 32) | fh[1]; 3202 return 
ino->i_ino == inum && fh[0] == ino->i_generation; 3203 } 3204 3205 /* Find any alias of inode, but prefer a hashed alias */ 3206 static struct dentry *shmem_find_alias(struct inode *inode) 3207 { 3208 struct dentry *alias = d_find_alias(inode); 3209 3210 return alias ?: d_find_any_alias(inode); 3211 } 3212 3213 3214 static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 3215 struct fid *fid, int fh_len, int fh_type) 3216 { 3217 struct inode *inode; 3218 struct dentry *dentry = NULL; 3219 u64 inum; 3220 3221 if (fh_len < 3) 3222 return NULL; 3223 3224 inum = fid->raw[2]; 3225 inum = (inum << 32) | fid->raw[1]; 3226 3227 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 3228 shmem_match, fid->raw); 3229 if (inode) { 3230 dentry = shmem_find_alias(inode); 3231 iput(inode); 3232 } 3233 3234 return dentry; 3235 } 3236 3237 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 3238 struct inode *parent) 3239 { 3240 if (*len < 3) { 3241 *len = 3; 3242 return FILEID_INVALID; 3243 } 3244 3245 if (inode_unhashed(inode)) { 3246 /* Unfortunately insert_inode_hash is not idempotent, 3247 * so as we hash inodes here rather than at creation 3248 * time, we need a lock to ensure we only try 3249 * to do it once 3250 */ 3251 static DEFINE_SPINLOCK(lock); 3252 spin_lock(&lock); 3253 if (inode_unhashed(inode)) 3254 __insert_inode_hash(inode, 3255 inode->i_ino + inode->i_generation); 3256 spin_unlock(&lock); 3257 } 3258 3259 fh[0] = inode->i_generation; 3260 fh[1] = inode->i_ino; 3261 fh[2] = ((__u64)inode->i_ino) >> 32; 3262 3263 *len = 3; 3264 return 1; 3265 } 3266 3267 static const struct export_operations shmem_export_ops = { 3268 .get_parent = shmem_get_parent, 3269 .encode_fh = shmem_encode_fh, 3270 .fh_to_dentry = shmem_fh_to_dentry, 3271 }; 3272 3273 static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, 3274 bool remount) 3275 { 3276 char *this_char, *value, *rest; 3277 struct mempolicy *mpol = NULL; 3278 uid_t uid; 3279 gid_t gid; 3280 3281 while (options != NULL) { 3282 this_char = options; 3283 for (;;) { 3284 /* 3285 * NUL-terminate this option: unfortunately, 3286 * mount options form a comma-separated list, 3287 * but mpol's nodelist may also contain commas. 
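 *
 * Hedged example of an option string this loop has to split (values are
 * arbitrary): in
 *
 *	size=50%,nr_inodes=100k,mode=1777,mpol=bind:0,2
 *
 * the final comma belongs to the mpol nodelist rather than starting a new
 * option, which is why the scan below only terminates an option when the
 * character after a comma is not a digit.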
3288 */ 3289 options = strchr(options, ','); 3290 if (options == NULL) 3291 break; 3292 options++; 3293 if (!isdigit(*options)) { 3294 options[-1] = '\0'; 3295 break; 3296 } 3297 } 3298 if (!*this_char) 3299 continue; 3300 if ((value = strchr(this_char,'=')) != NULL) { 3301 *value++ = 0; 3302 } else { 3303 pr_err("tmpfs: No value for mount option '%s'\n", 3304 this_char); 3305 goto error; 3306 } 3307 3308 if (!strcmp(this_char,"size")) { 3309 unsigned long long size; 3310 size = memparse(value,&rest); 3311 if (*rest == '%') { 3312 size <<= PAGE_SHIFT; 3313 size *= totalram_pages; 3314 do_div(size, 100); 3315 rest++; 3316 } 3317 if (*rest) 3318 goto bad_val; 3319 sbinfo->max_blocks = 3320 DIV_ROUND_UP(size, PAGE_SIZE); 3321 } else if (!strcmp(this_char,"nr_blocks")) { 3322 sbinfo->max_blocks = memparse(value, &rest); 3323 if (*rest) 3324 goto bad_val; 3325 } else if (!strcmp(this_char,"nr_inodes")) { 3326 sbinfo->max_inodes = memparse(value, &rest); 3327 if (*rest) 3328 goto bad_val; 3329 } else if (!strcmp(this_char,"mode")) { 3330 if (remount) 3331 continue; 3332 sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; 3333 if (*rest) 3334 goto bad_val; 3335 } else if (!strcmp(this_char,"uid")) { 3336 if (remount) 3337 continue; 3338 uid = simple_strtoul(value, &rest, 0); 3339 if (*rest) 3340 goto bad_val; 3341 sbinfo->uid = make_kuid(current_user_ns(), uid); 3342 if (!uid_valid(sbinfo->uid)) 3343 goto bad_val; 3344 } else if (!strcmp(this_char,"gid")) { 3345 if (remount) 3346 continue; 3347 gid = simple_strtoul(value, &rest, 0); 3348 if (*rest) 3349 goto bad_val; 3350 sbinfo->gid = make_kgid(current_user_ns(), gid); 3351 if (!gid_valid(sbinfo->gid)) 3352 goto bad_val; 3353 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3354 } else if (!strcmp(this_char, "huge")) { 3355 int huge; 3356 huge = shmem_parse_huge(value); 3357 if (huge < 0) 3358 goto bad_val; 3359 if (!has_transparent_hugepage() && 3360 huge != SHMEM_HUGE_NEVER) 3361 goto bad_val; 3362 sbinfo->huge = huge; 3363 #endif 3364 #ifdef CONFIG_NUMA 3365 } else if (!strcmp(this_char,"mpol")) { 3366 mpol_put(mpol); 3367 mpol = NULL; 3368 if (mpol_parse_str(value, &mpol)) 3369 goto bad_val; 3370 #endif 3371 } else { 3372 pr_err("tmpfs: Bad mount option %s\n", this_char); 3373 goto error; 3374 } 3375 } 3376 sbinfo->mpol = mpol; 3377 return 0; 3378 3379 bad_val: 3380 pr_err("tmpfs: Bad value '%s' for mount option '%s'\n", 3381 value, this_char); 3382 error: 3383 mpol_put(mpol); 3384 return 1; 3385 3386 } 3387 3388 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 3389 { 3390 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 3391 struct shmem_sb_info config = *sbinfo; 3392 unsigned long inodes; 3393 int error = -EINVAL; 3394 3395 config.mpol = NULL; 3396 if (shmem_parse_options(data, &config, true)) 3397 return error; 3398 3399 spin_lock(&sbinfo->stat_lock); 3400 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 3401 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) 3402 goto out; 3403 if (config.max_inodes < inodes) 3404 goto out; 3405 /* 3406 * Those tests disallow limited->unlimited while any are in use; 3407 * but we must separately disallow unlimited->limited, because 3408 * in that case we have no record of how much is already in use. 
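 *
 * Hedged examples (sizes arbitrary): an instance mounted with "size=1G" may
 * be remounted with "size=2G"; remounting it to "nr_blocks=0" (unlimited) is
 * refused while any blocks are in use; and an instance mounted unlimited can
 * never be remounted to a finite limit, failing here with -EINVAL.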
3409 */ 3410 if (config.max_blocks && !sbinfo->max_blocks) 3411 goto out; 3412 if (config.max_inodes && !sbinfo->max_inodes) 3413 goto out; 3414 3415 error = 0; 3416 sbinfo->huge = config.huge; 3417 sbinfo->max_blocks = config.max_blocks; 3418 sbinfo->max_inodes = config.max_inodes; 3419 sbinfo->free_inodes = config.max_inodes - inodes; 3420 3421 /* 3422 * Preserve previous mempolicy unless mpol remount option was specified. 3423 */ 3424 if (config.mpol) { 3425 mpol_put(sbinfo->mpol); 3426 sbinfo->mpol = config.mpol; /* transfers initial ref */ 3427 } 3428 out: 3429 spin_unlock(&sbinfo->stat_lock); 3430 return error; 3431 } 3432 3433 static int shmem_show_options(struct seq_file *seq, struct dentry *root) 3434 { 3435 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 3436 3437 if (sbinfo->max_blocks != shmem_default_max_blocks()) 3438 seq_printf(seq, ",size=%luk", 3439 sbinfo->max_blocks << (PAGE_SHIFT - 10)); 3440 if (sbinfo->max_inodes != shmem_default_max_inodes()) 3441 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 3442 if (sbinfo->mode != (0777 | S_ISVTX)) 3443 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 3444 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 3445 seq_printf(seq, ",uid=%u", 3446 from_kuid_munged(&init_user_ns, sbinfo->uid)); 3447 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 3448 seq_printf(seq, ",gid=%u", 3449 from_kgid_munged(&init_user_ns, sbinfo->gid)); 3450 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3451 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ 3452 if (sbinfo->huge) 3453 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); 3454 #endif 3455 shmem_show_mpol(seq, sbinfo->mpol); 3456 return 0; 3457 } 3458 3459 #endif /* CONFIG_TMPFS */ 3460 3461 static void shmem_put_super(struct super_block *sb) 3462 { 3463 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 3464 3465 percpu_counter_destroy(&sbinfo->used_blocks); 3466 mpol_put(sbinfo->mpol); 3467 kfree(sbinfo); 3468 sb->s_fs_info = NULL; 3469 } 3470 3471 int shmem_fill_super(struct super_block *sb, void *data, int silent) 3472 { 3473 struct inode *inode; 3474 struct shmem_sb_info *sbinfo; 3475 int err = -ENOMEM; 3476 3477 /* Round up to L1_CACHE_BYTES to resist false sharing */ 3478 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 3479 L1_CACHE_BYTES), GFP_KERNEL); 3480 if (!sbinfo) 3481 return -ENOMEM; 3482 3483 sbinfo->mode = 0777 | S_ISVTX; 3484 sbinfo->uid = current_fsuid(); 3485 sbinfo->gid = current_fsgid(); 3486 sb->s_fs_info = sbinfo; 3487 3488 #ifdef CONFIG_TMPFS 3489 /* 3490 * Per default we only allow half of the physical ram per 3491 * tmpfs instance, limiting inodes to one per page of lowmem; 3492 * but the internal instance is left unlimited. 
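 *
 * Hedged arithmetic example: with 16GB of RAM, 4K pages and no highmem, an
 * option-less tmpfs mount defaults to size=8G (totalram_pages / 2) and
 * nr_inodes=2097152 (also half the page count, since totalhigh_pages is 0).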
3493 */ 3494 if (!(sb->s_flags & SB_KERNMOUNT)) { 3495 sbinfo->max_blocks = shmem_default_max_blocks(); 3496 sbinfo->max_inodes = shmem_default_max_inodes(); 3497 if (shmem_parse_options(data, sbinfo, false)) { 3498 err = -EINVAL; 3499 goto failed; 3500 } 3501 } else { 3502 sb->s_flags |= SB_NOUSER; 3503 } 3504 sb->s_export_op = &shmem_export_ops; 3505 sb->s_flags |= SB_NOSEC; 3506 #else 3507 sb->s_flags |= SB_NOUSER; 3508 #endif 3509 3510 spin_lock_init(&sbinfo->stat_lock); 3511 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 3512 goto failed; 3513 sbinfo->free_inodes = sbinfo->max_inodes; 3514 spin_lock_init(&sbinfo->shrinklist_lock); 3515 INIT_LIST_HEAD(&sbinfo->shrinklist); 3516 3517 sb->s_maxbytes = MAX_LFS_FILESIZE; 3518 sb->s_blocksize = PAGE_SIZE; 3519 sb->s_blocksize_bits = PAGE_SHIFT; 3520 sb->s_magic = TMPFS_MAGIC; 3521 sb->s_op = &shmem_ops; 3522 sb->s_time_gran = 1; 3523 #ifdef CONFIG_TMPFS_XATTR 3524 sb->s_xattr = shmem_xattr_handlers; 3525 #endif 3526 #ifdef CONFIG_TMPFS_POSIX_ACL 3527 sb->s_flags |= SB_POSIXACL; 3528 #endif 3529 uuid_gen(&sb->s_uuid); 3530 3531 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 3532 if (!inode) 3533 goto failed; 3534 inode->i_uid = sbinfo->uid; 3535 inode->i_gid = sbinfo->gid; 3536 sb->s_root = d_make_root(inode); 3537 if (!sb->s_root) 3538 goto failed; 3539 return 0; 3540 3541 failed: 3542 shmem_put_super(sb); 3543 return err; 3544 } 3545 3546 static struct kmem_cache *shmem_inode_cachep; 3547 3548 static struct inode *shmem_alloc_inode(struct super_block *sb) 3549 { 3550 struct shmem_inode_info *info; 3551 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 3552 if (!info) 3553 return NULL; 3554 return &info->vfs_inode; 3555 } 3556 3557 static void shmem_destroy_callback(struct rcu_head *head) 3558 { 3559 struct inode *inode = container_of(head, struct inode, i_rcu); 3560 if (S_ISLNK(inode->i_mode)) 3561 kfree(inode->i_link); 3562 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 3563 } 3564 3565 static void shmem_destroy_inode(struct inode *inode) 3566 { 3567 if (S_ISREG(inode->i_mode)) 3568 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 3569 call_rcu(&inode->i_rcu, shmem_destroy_callback); 3570 } 3571 3572 static void shmem_init_inode(void *foo) 3573 { 3574 struct shmem_inode_info *info = foo; 3575 inode_init_once(&info->vfs_inode); 3576 } 3577 3578 static void shmem_init_inodecache(void) 3579 { 3580 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 3581 sizeof(struct shmem_inode_info), 3582 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); 3583 } 3584 3585 static void shmem_destroy_inodecache(void) 3586 { 3587 kmem_cache_destroy(shmem_inode_cachep); 3588 } 3589 3590 static const struct address_space_operations shmem_aops = { 3591 .writepage = shmem_writepage, 3592 .set_page_dirty = __set_page_dirty_no_writeback, 3593 #ifdef CONFIG_TMPFS 3594 .write_begin = shmem_write_begin, 3595 .write_end = shmem_write_end, 3596 #endif 3597 #ifdef CONFIG_MIGRATION 3598 .migratepage = migrate_page, 3599 #endif 3600 .error_remove_page = generic_error_remove_page, 3601 }; 3602 3603 static const struct file_operations shmem_file_operations = { 3604 .mmap = shmem_mmap, 3605 .get_unmapped_area = shmem_get_unmapped_area, 3606 #ifdef CONFIG_TMPFS 3607 .llseek = shmem_file_llseek, 3608 .read_iter = shmem_file_read_iter, 3609 .write_iter = generic_file_write_iter, 3610 .fsync = noop_fsync, 3611 .splice_read = generic_file_splice_read, 3612 .splice_write = iter_file_splice_write, 3613 .fallocate = 
shmem_fallocate, 3614 #endif 3615 }; 3616 3617 static const struct inode_operations shmem_inode_operations = { 3618 .getattr = shmem_getattr, 3619 .setattr = shmem_setattr, 3620 #ifdef CONFIG_TMPFS_XATTR 3621 .listxattr = shmem_listxattr, 3622 .set_acl = simple_set_acl, 3623 #endif 3624 }; 3625 3626 static const struct inode_operations shmem_dir_inode_operations = { 3627 #ifdef CONFIG_TMPFS 3628 .create = shmem_create, 3629 .lookup = simple_lookup, 3630 .link = shmem_link, 3631 .unlink = shmem_unlink, 3632 .symlink = shmem_symlink, 3633 .mkdir = shmem_mkdir, 3634 .rmdir = shmem_rmdir, 3635 .mknod = shmem_mknod, 3636 .rename = shmem_rename2, 3637 .tmpfile = shmem_tmpfile, 3638 #endif 3639 #ifdef CONFIG_TMPFS_XATTR 3640 .listxattr = shmem_listxattr, 3641 #endif 3642 #ifdef CONFIG_TMPFS_POSIX_ACL 3643 .setattr = shmem_setattr, 3644 .set_acl = simple_set_acl, 3645 #endif 3646 }; 3647 3648 static const struct inode_operations shmem_special_inode_operations = { 3649 #ifdef CONFIG_TMPFS_XATTR 3650 .listxattr = shmem_listxattr, 3651 #endif 3652 #ifdef CONFIG_TMPFS_POSIX_ACL 3653 .setattr = shmem_setattr, 3654 .set_acl = simple_set_acl, 3655 #endif 3656 }; 3657 3658 static const struct super_operations shmem_ops = { 3659 .alloc_inode = shmem_alloc_inode, 3660 .destroy_inode = shmem_destroy_inode, 3661 #ifdef CONFIG_TMPFS 3662 .statfs = shmem_statfs, 3663 .remount_fs = shmem_remount_fs, 3664 .show_options = shmem_show_options, 3665 #endif 3666 .evict_inode = shmem_evict_inode, 3667 .drop_inode = generic_delete_inode, 3668 .put_super = shmem_put_super, 3669 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3670 .nr_cached_objects = shmem_unused_huge_count, 3671 .free_cached_objects = shmem_unused_huge_scan, 3672 #endif 3673 }; 3674 3675 static const struct vm_operations_struct shmem_vm_ops = { 3676 .fault = shmem_fault, 3677 .map_pages = filemap_map_pages, 3678 #ifdef CONFIG_NUMA 3679 .set_policy = shmem_set_policy, 3680 .get_policy = shmem_get_policy, 3681 #endif 3682 }; 3683 3684 static struct dentry *shmem_mount(struct file_system_type *fs_type, 3685 int flags, const char *dev_name, void *data) 3686 { 3687 return mount_nodev(fs_type, flags, data, shmem_fill_super); 3688 } 3689 3690 static struct file_system_type shmem_fs_type = { 3691 .owner = THIS_MODULE, 3692 .name = "tmpfs", 3693 .mount = shmem_mount, 3694 .kill_sb = kill_litter_super, 3695 .fs_flags = FS_USERNS_MOUNT, 3696 }; 3697 3698 int __init shmem_init(void) 3699 { 3700 int error; 3701 3702 /* If rootfs called this, don't re-init */ 3703 if (shmem_inode_cachep) 3704 return 0; 3705 3706 shmem_init_inodecache(); 3707 3708 error = register_filesystem(&shmem_fs_type); 3709 if (error) { 3710 pr_err("Could not register tmpfs\n"); 3711 goto out2; 3712 } 3713 3714 shm_mnt = kern_mount(&shmem_fs_type); 3715 if (IS_ERR(shm_mnt)) { 3716 error = PTR_ERR(shm_mnt); 3717 pr_err("Could not kern_mount tmpfs\n"); 3718 goto out1; 3719 } 3720 3721 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3722 if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) 3723 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; 3724 else 3725 shmem_huge = 0; /* just in case it was patched */ 3726 #endif 3727 return 0; 3728 3729 out1: 3730 unregister_filesystem(&shmem_fs_type); 3731 out2: 3732 shmem_destroy_inodecache(); 3733 shm_mnt = ERR_PTR(error); 3734 return error; 3735 } 3736 3737 #if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) 3738 static ssize_t shmem_enabled_show(struct kobject *kobj, 3739 struct kobj_attribute *attr, char *buf) 3740 { 3741 int values[] = { 
3742 SHMEM_HUGE_ALWAYS, 3743 SHMEM_HUGE_WITHIN_SIZE, 3744 SHMEM_HUGE_ADVISE, 3745 SHMEM_HUGE_NEVER, 3746 SHMEM_HUGE_DENY, 3747 SHMEM_HUGE_FORCE, 3748 }; 3749 int i, count; 3750 3751 for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) { 3752 const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s "; 3753 3754 count += sprintf(buf + count, fmt, 3755 shmem_format_huge(values[i])); 3756 } 3757 buf[count - 1] = '\n'; 3758 return count; 3759 } 3760 3761 static ssize_t shmem_enabled_store(struct kobject *kobj, 3762 struct kobj_attribute *attr, const char *buf, size_t count) 3763 { 3764 char tmp[16]; 3765 int huge; 3766 3767 if (count + 1 > sizeof(tmp)) 3768 return -EINVAL; 3769 memcpy(tmp, buf, count); 3770 tmp[count] = '\0'; 3771 if (count && tmp[count - 1] == '\n') 3772 tmp[count - 1] = '\0'; 3773 3774 huge = shmem_parse_huge(tmp); 3775 if (huge == -EINVAL) 3776 return -EINVAL; 3777 if (!has_transparent_hugepage() && 3778 huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) 3779 return -EINVAL; 3780 3781 shmem_huge = huge; 3782 if (shmem_huge > SHMEM_HUGE_DENY) 3783 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; 3784 return count; 3785 } 3786 3787 struct kobj_attribute shmem_enabled_attr = 3788 __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); 3789 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ 3790 3791 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3792 bool shmem_huge_enabled(struct vm_area_struct *vma) 3793 { 3794 struct inode *inode = file_inode(vma->vm_file); 3795 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 3796 loff_t i_size; 3797 pgoff_t off; 3798 3799 if (shmem_huge == SHMEM_HUGE_FORCE) 3800 return true; 3801 if (shmem_huge == SHMEM_HUGE_DENY) 3802 return false; 3803 switch (sbinfo->huge) { 3804 case SHMEM_HUGE_NEVER: 3805 return false; 3806 case SHMEM_HUGE_ALWAYS: 3807 return true; 3808 case SHMEM_HUGE_WITHIN_SIZE: 3809 off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); 3810 i_size = round_up(i_size_read(inode), PAGE_SIZE); 3811 if (i_size >= HPAGE_PMD_SIZE && 3812 i_size >> PAGE_SHIFT >= off) 3813 return true; 3814 /* fall through */ 3815 case SHMEM_HUGE_ADVISE: 3816 /* TODO: implement fadvise() hints */ 3817 return (vma->vm_flags & VM_HUGEPAGE); 3818 default: 3819 VM_BUG_ON(1); 3820 return false; 3821 } 3822 } 3823 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ 3824 3825 #else /* !CONFIG_SHMEM */ 3826 3827 /* 3828 * tiny-shmem: simple shmemfs and tmpfs using ramfs code 3829 * 3830 * This is intended for small system where the benefits of the full 3831 * shmem code (swap-backed and resource-limited) are outweighed by 3832 * their complexity. On systems without swap this code should be 3833 * effectively equivalent, but much lighter weight. 
3834 */ 3835 3836 static struct file_system_type shmem_fs_type = { 3837 .name = "tmpfs", 3838 .mount = ramfs_mount, 3839 .kill_sb = kill_litter_super, 3840 .fs_flags = FS_USERNS_MOUNT, 3841 }; 3842 3843 int __init shmem_init(void) 3844 { 3845 BUG_ON(register_filesystem(&shmem_fs_type) != 0); 3846 3847 shm_mnt = kern_mount(&shmem_fs_type); 3848 BUG_ON(IS_ERR(shm_mnt)); 3849 3850 return 0; 3851 } 3852 3853 int shmem_unuse(swp_entry_t swap, struct page *page) 3854 { 3855 return 0; 3856 } 3857 3858 int shmem_lock(struct file *file, int lock, struct user_struct *user) 3859 { 3860 return 0; 3861 } 3862 3863 void shmem_unlock_mapping(struct address_space *mapping) 3864 { 3865 } 3866 3867 #ifdef CONFIG_MMU 3868 unsigned long shmem_get_unmapped_area(struct file *file, 3869 unsigned long addr, unsigned long len, 3870 unsigned long pgoff, unsigned long flags) 3871 { 3872 return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); 3873 } 3874 #endif 3875 3876 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 3877 { 3878 truncate_inode_pages_range(inode->i_mapping, lstart, lend); 3879 } 3880 EXPORT_SYMBOL_GPL(shmem_truncate_range); 3881 3882 #define shmem_vm_ops generic_file_vm_ops 3883 #define shmem_file_operations ramfs_file_operations 3884 #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 3885 #define shmem_acct_size(flags, size) 0 3886 #define shmem_unacct_size(flags, size) do {} while (0) 3887 3888 #endif /* CONFIG_SHMEM */ 3889 3890 /* common code */ 3891 3892 static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, 3893 unsigned long flags, unsigned int i_flags) 3894 { 3895 struct inode *inode; 3896 struct file *res; 3897 3898 if (IS_ERR(mnt)) 3899 return ERR_CAST(mnt); 3900 3901 if (size < 0 || size > MAX_LFS_FILESIZE) 3902 return ERR_PTR(-EINVAL); 3903 3904 if (shmem_acct_size(flags, size)) 3905 return ERR_PTR(-ENOMEM); 3906 3907 inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, 3908 flags); 3909 if (unlikely(!inode)) { 3910 shmem_unacct_size(flags, size); 3911 return ERR_PTR(-ENOSPC); 3912 } 3913 inode->i_flags |= i_flags; 3914 inode->i_size = size; 3915 clear_nlink(inode); /* It is unlinked */ 3916 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); 3917 if (!IS_ERR(res)) 3918 res = alloc_file_pseudo(inode, mnt, name, O_RDWR, 3919 &shmem_file_operations); 3920 if (IS_ERR(res)) 3921 iput(inode); 3922 return res; 3923 } 3924 3925 /** 3926 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be 3927 * kernel internal. There will be NO LSM permission checks against the 3928 * underlying inode. So users of this interface must do LSM checks at a 3929 * higher layer. The users are the big_key and shm implementations. LSM 3930 * checks are provided at the key or shm level rather than the inode. 
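 * A hedged in-kernel usage sketch (name and size arbitrary, error handling
 * trimmed):
 *	struct file *file = shmem_kernel_file_setup("demo", PAGE_SIZE, 0);
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);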
3931 * @name: name for dentry (to be seen in /proc/<pid>/maps 3932 * @size: size to be set for the file 3933 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3934 */ 3935 struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) 3936 { 3937 return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); 3938 } 3939 3940 /** 3941 * shmem_file_setup - get an unlinked file living in tmpfs 3942 * @name: name for dentry (to be seen in /proc/<pid>/maps 3943 * @size: size to be set for the file 3944 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3945 */ 3946 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) 3947 { 3948 return __shmem_file_setup(shm_mnt, name, size, flags, 0); 3949 } 3950 EXPORT_SYMBOL_GPL(shmem_file_setup); 3951 3952 /** 3953 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs 3954 * @mnt: the tmpfs mount where the file will be created 3955 * @name: name for dentry (to be seen in /proc/<pid>/maps 3956 * @size: size to be set for the file 3957 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3958 */ 3959 struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, 3960 loff_t size, unsigned long flags) 3961 { 3962 return __shmem_file_setup(mnt, name, size, flags, 0); 3963 } 3964 EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); 3965 3966 /** 3967 * shmem_zero_setup - setup a shared anonymous mapping 3968 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff 3969 */ 3970 int shmem_zero_setup(struct vm_area_struct *vma) 3971 { 3972 struct file *file; 3973 loff_t size = vma->vm_end - vma->vm_start; 3974 3975 /* 3976 * Cloning a new file under mmap_sem leads to a lock ordering conflict 3977 * between XFS directory reading and selinux: since this file is only 3978 * accessible to the user through its mapping, use S_PRIVATE flag to 3979 * bypass file security, in the same way as shmem_kernel_file_setup(). 3980 */ 3981 file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); 3982 if (IS_ERR(file)) 3983 return PTR_ERR(file); 3984 3985 if (vma->vm_file) 3986 fput(vma->vm_file); 3987 vma->vm_file = file; 3988 vma->vm_ops = &shmem_vm_ops; 3989 3990 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && 3991 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < 3992 (vma->vm_end & HPAGE_PMD_MASK)) { 3993 khugepaged_enter(vma, vma->vm_flags); 3994 } 3995 3996 return 0; 3997 } 3998 3999 /** 4000 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. 4001 * @mapping: the page's address_space 4002 * @index: the page index 4003 * @gfp: the page allocator flags to use if allocating 4004 * 4005 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", 4006 * with any new page allocations done using the specified allocation flags. 4007 * But read_cache_page_gfp() uses the ->readpage() method: which does not 4008 * suit tmpfs, since it may have pages in swapcache, and needs to find those 4009 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 4010 * 4011 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in 4012 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 
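 *
 * Hedged usage sketch (the mapping is assumed to belong to a shmem/tmpfs
 * inode, as in the GEM drivers above; error handling abbreviated; drop the
 * page reference with put_page() when done):
 *
 *	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
 *	struct page *page = shmem_read_mapping_page_gfp(mapping, index, gfp);
 *
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	...
 *	put_page(page);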
4013 */ 4014 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 4015 pgoff_t index, gfp_t gfp) 4016 { 4017 #ifdef CONFIG_SHMEM 4018 struct inode *inode = mapping->host; 4019 struct page *page; 4020 int error; 4021 4022 BUG_ON(mapping->a_ops != &shmem_aops); 4023 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, 4024 gfp, NULL, NULL, NULL); 4025 if (error) 4026 page = ERR_PTR(error); 4027 else 4028 unlock_page(page); 4029 return page; 4030 #else 4031 /* 4032 * The tiny !SHMEM case uses ramfs without swap 4033 */ 4034 return read_cache_page_gfp(mapping, index, gfp); 4035 #endif 4036 } 4037 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 4038
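
/*
 * Editor's note: a hedged userspace sketch, kept in this comment so it is
 * not part of the build. A memfd lives on the internal tmpfs mount set up
 * above, so the sequence below exercises shmem_fallocate() (hole punch),
 * shmem_fault() via mmap, and adds the F_SEAL_GROW seal that
 * shmem_write_begin() and shmem_fallocate() check. Names and sizes are
 * arbitrary; memfd_create(), the fallocate flags and the sealing fcntls are
 * the standard interfaces (glibc >= 2.27 for the memfd_create() wrapper).
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = memfd_create("demo", MFD_ALLOW_SEALING);
 *	ftruncate(fd, 16 * 4096);
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 4096);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_GROW);
 *	char *p = mmap(NULL, 16 * 4096, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *	p[0] = 1;
 *	munmap(p, 16 * 4096);
 *	close(fd);
 */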