/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>

#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>

#include <linux/uaccess.h>
#include <asm/pgtable.h>

#include "internal.h"

#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
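/*
 * Illustrative numbers only, assuming 4KB pages: BLOCKS_PER_PAGE is then
 * 4096/512 = 8 (i_blocks is kept in 512-byte units), and VM_ACCT(10000)
 * rounds the size up to whole pages, so it accounts 3 pages.
 */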
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_mutex making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
}
#endif

static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp,
		gfp_t gfp, struct vm_area_struct *vma,
		struct vm_fault *vmf, vm_fault_t *fault_type);

int shmem_getpage(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp)
{
	return shmem_getpage_gfp(inode, index, pagep, sgp,
		mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
}

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & VM_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}
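/*
 * Illustrative example for the helper above, assuming 4KB pages: growing a
 * pre-accounted (!VM_NORESERVE) object from 1MiB to 2MiB charges only the
 * difference, VM_ACCT(2M) - VM_ACCT(1M) = 256 pages; shrinking it returns
 * that difference to the commitment pool.
 */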
/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags, long pages)
{
	if (!(flags & VM_NORESERVE))
		return 0;

	return security_vm_enough_memory_mm(current->mm,
			pages * VM_ACCT(PAGE_SIZE));
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (shmem_acct_block(info->flags, pages))
		return false;

	if (sbinfo->max_blocks) {
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   sbinfo->max_blocks - pages) > 0)
			goto unacct;
		percpu_counter_add(&sbinfo->used_blocks, pages);
	}

	return true;

unacct:
	shmem_unacct_blocks(info->flags, pages);
	return false;
}

static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks, pages);
	shmem_unacct_blocks(info->flags, pages);
}

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static struct file_system_type shmem_fs_type;

bool vma_is_shmem(struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

static int shmem_reserve_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return -ENOSPC;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}
	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}
/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		info->alloced -= freed;
		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
		shmem_inode_unacct_blocks(inode, freed);
	}
}

bool shmem_charge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	if (!shmem_inode_acct_block(inode, pages))
		return false;

	spin_lock_irqsave(&info->lock, flags);
	info->alloced += pages;
	inode->i_blocks += pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);
	inode->i_mapping->nrpages += pages;

	return true;
}

void shmem_uncharge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	spin_lock_irqsave(&info->lock, flags);
	info->alloced -= pages;
	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);

	shmem_inode_unacct_blocks(inode, pages);
}

/*
 * Replace item expected in xarray by a new item, while holding xa_lock.
 */
static int shmem_replace_entry(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *item;

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);
	item = xas_load(&xas);
	if (item != expected)
		return -ENOENT;
	xas_store(&xas, replacement);
	return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	void *item;

	rcu_read_lock();
	item = radix_tree_lookup(&mapping->i_pages, index);
	rcu_read_unlock();
	return item == swp_to_radix_entry(swap);
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with fadvise()/madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3
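/*
 * Illustrative usage, assuming a kernel built with
 * CONFIG_TRANSPARENT_HUGE_PAGECACHE: the per-mount policy above is chosen
 * with the "huge=" mount option, e.g.
 *
 *	mount -t tmpfs -o size=1G,huge=within_size tmpfs /mnt
 *
 * whereas the two special values below are only reachable via sysfs.
 */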
/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)

#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
/* ifdef here to avoid bloating shmem.o when not necessary */

static int shmem_huge __read_mostly;

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -EINVAL;
}

static const char *shmem_format_huge(int huge)
{
	switch (huge) {
	case SHMEM_HUGE_NEVER:
		return "never";
	case SHMEM_HUGE_ALWAYS:
		return "always";
	case SHMEM_HUGE_WITHIN_SIZE:
		return "within_size";
	case SHMEM_HUGE_ADVISE:
		return "advise";
	case SHMEM_HUGE_DENY:
		return "deny";
	case SHMEM_HUGE_FORCE:
		return "force";
	default:
		VM_BUG_ON(1);
		return "bad_val";
	}
}
#endif

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	LIST_HEAD(list), *pos, *next;
	LIST_HEAD(to_remove);
	struct inode *inode;
	struct shmem_inode_info *info;
	struct page *page;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	int removed = 0, split = 0;

	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			removed++;
			goto next;
		}

		/* Check if there's anything to gain */
		if (round_up(inode->i_size, PAGE_SIZE) ==
				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
			list_move(&info->shrinklist, &to_remove);
			removed++;
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	list_for_each_safe(pos, next, &to_remove) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;
		list_del_init(&info->shrinklist);
		iput(inode);
	}

	list_for_each_safe(pos, next, &list) {
		int ret;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		if (nr_to_split && split >= nr_to_split)
			goto leave;

		page = find_get_page(inode->i_mapping,
				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
		if (!page)
			goto drop;

		/* No huge page at the end of the file: nothing to split */
		if (!PageTransHuge(page)) {
			put_page(page);
			goto drop;
		}

		/*
		 * Leave the inode on the list if we failed to lock
		 * the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!trylock_page(page)) {
			put_page(page);
			goto leave;
		}

		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);

		/* If split failed leave the inode on the list */
		if (ret)
			goto leave;

		split++;
drop:
		list_del_init(&info->shrinklist);
		removed++;
leave:
		iput(inode);
	}

	spin_lock(&sbinfo->shrinklist_lock);
	list_splice_tail(&list, &sbinfo->shrinklist);
	sbinfo->shrinklist_len -= removed;
	spin_unlock(&sbinfo->shrinklist_lock);

	return split;
}

static long shmem_unused_huge_scan(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (!READ_ONCE(sbinfo->shrinklist_len))
		return SHRINK_STOP;

	return shmem_unused_huge_shrink(sbinfo, sc, 0);
}

static long shmem_unused_huge_count(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	return READ_ONCE(sbinfo->shrinklist_len);
}
#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */

#define shmem_huge SHMEM_HUGE_DENY

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */

static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
{
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
	    (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
	    shmem_huge != SHMEM_HUGE_DENY)
		return true;
	return false;
}

/*
 * Like add_to_page_cache_locked, but error if expected item has gone.
 */
static int shmem_add_to_page_cache(struct page *page,
				   struct address_space *mapping,
				   pgoff_t index, void *expected)
{
	int error, nr = hpage_nr_pages(page);

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
	VM_BUG_ON(expected && PageTransHuge(page));

	page_ref_add(page, nr);
	page->mapping = mapping;
	page->index = index;

	xa_lock_irq(&mapping->i_pages);
	if (PageTransHuge(page)) {
		void __rcu **results;
		pgoff_t idx;
		int i;

		error = 0;
		if (radix_tree_gang_lookup_slot(&mapping->i_pages,
					&results, &idx, index, 1) &&
				idx < index + HPAGE_PMD_NR) {
			error = -EEXIST;
		}

		if (!error) {
			for (i = 0; i < HPAGE_PMD_NR; i++) {
				error = radix_tree_insert(&mapping->i_pages,
						index + i, page + i);
				VM_BUG_ON(error);
			}
			count_vm_event(THP_FILE_ALLOC);
		}
	} else if (!expected) {
		error = radix_tree_insert(&mapping->i_pages, index, page);
	} else {
		error = shmem_replace_entry(mapping, index, expected, page);
	}

	if (!error) {
		mapping->nrpages += nr;
		if (PageTransHuge(page))
			__inc_node_page_state(page, NR_SHMEM_THPS);
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
		xa_unlock_irq(&mapping->i_pages);
	} else {
		page->mapping = NULL;
		xa_unlock_irq(&mapping->i_pages);
		page_ref_sub(page, nr);
	}
	return error;
}
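/*
 * Concrete numbers for the helper above (illustrative: x86-64 with 4KB base
 * pages and a 2MB PMD): hpage_nr_pages() is HPAGE_PMD_NR = 512 for a THP, so
 * one huge insertion populates 512 consecutive slots, and the alignment
 * VM_BUG_ON requires index to be a multiple of 512.
 */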
/*
 * Like delete_from_page_cache, but substitutes swap for page.
 */
static void shmem_delete_from_page_cache(struct page *page, void *radswap)
{
	struct address_space *mapping = page->mapping;
	int error;

	VM_BUG_ON_PAGE(PageCompound(page), page);

	xa_lock_irq(&mapping->i_pages);
	error = shmem_replace_entry(mapping, page->index, page, radswap);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_node_page_state(page, NR_FILE_PAGES);
	__dec_node_page_state(page, NR_SHMEM);
	xa_unlock_irq(&mapping->i_pages);
	put_page(page);
	BUG_ON(error);
}

/*
 * Remove swap entry from radix tree, free the swap and its page cache.
 */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	void *old;

	xa_lock_irq(&mapping->i_pages);
	old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
	xa_unlock_irq(&mapping->i_pages);
	if (old != radswap)
		return -ENOENT;
	free_swap_and_cache(radix_to_swp_entry(radswap));
	return 0;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	struct radix_tree_iter iter;
	void __rcu **slot;
	struct page *page;
	unsigned long swapped = 0;

	rcu_read_lock();

	radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
		if (iter.index >= end)
			break;

		page = radix_tree_deref_slot(slot);

		if (radix_tree_deref_retry(page)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}

		if (xa_is_value(page))
			swapped++;

		if (need_resched()) {
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();
		}
	}

	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma is swapped out.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	unsigned long swapped;

	/* Be careful as we don't hold info->lock */
	swapped = READ_ONCE(info->swapped);

	/*
	 * The easier cases are when the shmem object has nothing in swap, or
	 * the vma maps it whole. Then we can simply use the stats that we
	 * already track.
	 */
	if (!swapped)
		return 0;

	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
		return swapped << PAGE_SHIFT;

	/* Here comes the more involved part */
	return shmem_partial_swap_usage(mapping,
			linear_page_index(vma, vma->vm_start),
			linear_page_index(vma, vma->vm_end));
}
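/*
 * The two helpers above return bytes, not pages: they back the "Swap:" line
 * that the smaps code in fs/proc/task_mmu.c reports for shmem mappings.
 * Illustrative example with 4KB pages: three swapped-out pages show up as
 * 3 << PAGE_SHIFT = 12288 bytes.
 */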
/*
 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index = 0;

	pagevec_init(&pvec);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
	 */
	while (!mapping_unevictable(mapping)) {
		/*
		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
		 */
		pvec.nr = find_get_entries(mapping, index,
					   PAGEVEC_SIZE, pvec.pages, indices);
		if (!pvec.nr)
			break;
		index = indices[pvec.nr - 1] + 1;
		pagevec_remove_exceptionals(&pvec);
		check_move_unevictable_pages(pvec.pages, pvec.nr);
		pagevec_release(&pvec);
		cond_resched();
	}
}

/*
 * Remove range of pages and swap entries from radix tree, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
	unsigned int partial_start = lstart & (PAGE_SIZE - 1);
	unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	pagevec_init(&pvec);
	index = start;
	while (index < end) {
		pvec.nr = find_get_entries(mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE),
			pvec.pages, indices);
		if (!pvec.nr)
			break;
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (xa_is_value(page)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
								index, page);
				continue;
			}

			VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);

			if (!trylock_page(page))
				continue;

			if (PageTransTail(page)) {
				/* Middle of THP: zero out the page */
				clear_highpage(page);
				unlock_page(page);
				continue;
			} else if (PageTransHuge(page)) {
				if (index == round_down(end, HPAGE_PMD_NR)) {
					/*
					 * Range ends in the middle of THP:
					 * zero out the page
					 */
					clear_highpage(page);
					unlock_page(page);
					continue;
				}
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
			}

			if (!unfalloc || !PageUptodate(page)) {
				VM_BUG_ON_PAGE(PageTail(page), page);
				if (page_mapping(page) == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}

	if (partial_start) {
		struct page *page = NULL;
		shmem_getpage(inode, start - 1, &page, SGP_READ);
		if (page) {
			unsigned int top = PAGE_SIZE;
			if (start > end) {
				top = partial_end;
				partial_end = 0;
			}
			zero_user_segment(page, partial_start, top);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	if (partial_end) {
		struct page *page = NULL;
		shmem_getpage(inode, end, &page, SGP_READ);
		if (page) {
			zero_user_segment(page, 0, partial_end);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	if (start >= end)
		return;

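	/*
	 * Second pass: the trylock-based sweep above may have skipped pages it
	 * could not lock immediately.  From here on we lock_page() and, when
	 * truncating to EOF (lend == -1), keep restarting until nothing at all
	 * is left in the range.
	 */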
	index = start;
	while (index < end) {
		cond_resched();

		pvec.nr = find_get_entries(mapping, index,
				min(end - index, (pgoff_t)PAGEVEC_SIZE),
				pvec.pages, indices);
		if (!pvec.nr) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (xa_is_value(page)) {
				if (unfalloc)
					continue;
				if (shmem_free_swap(mapping, index, page)) {
					/* Swap was replaced by page: retry */
					index--;
					break;
				}
				nr_swaps_freed++;
				continue;
			}

			lock_page(page);

			if (PageTransTail(page)) {
				/* Middle of THP: zero out the page */
				clear_highpage(page);
				unlock_page(page);
				/*
				 * Partial thp truncate due to 'start' in middle
				 * of THP: don't need to look on these pages
				 * again on !pvec.nr restart.
				 */
				if (index != round_down(end, HPAGE_PMD_NR))
					start++;
				continue;
			} else if (PageTransHuge(page)) {
				if (index == round_down(end, HPAGE_PMD_NR)) {
					/*
					 * Range ends in the middle of THP:
					 * zero out the page
					 */
					clear_highpage(page);
					unlock_page(page);
					continue;
				}
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
			}

			if (!unfalloc || !PageUptodate(page)) {
				VM_BUG_ON_PAGE(PageTail(page), page);
				if (page_mapping(page) == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				} else {
					/* Page was replaced by swap: retry */
					unlock_page(page);
					index--;
					break;
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		index++;
	}

	spin_lock_irq(&info->lock);
	info->swapped -= nr_swaps_freed;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
}

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	shmem_undo_range(inode, lstart, lend, false);
	inode->i_ctime = inode->i_mtime = current_time(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

static int shmem_getattr(const struct path *path, struct kstat *stat,
			 u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);

	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);
	}
	generic_fillattr(inode, stat);

	if (is_huge_enabled(sb_info))
		stat->blksize = HPAGE_PMD_SIZE;

	return 0;
}

static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	int error;

	error = setattr_prepare(dentry, attr);
	if (error)
		return error;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		/* protected by i_mutex */
		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
			return -EPERM;

		if (newsize != oldsize) {
			error = shmem_reacct_size(SHMEM_I(inode)->flags,
					oldsize, newsize);
			if (error)
				return error;
			i_size_write(inode, newsize);
			inode->i_ctime = inode->i_mtime = current_time(inode);
		}
		if (newsize <= oldsize) {
			loff_t holebegin = round_up(newsize, PAGE_SIZE);
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);
			if (info->alloced)
				shmem_truncate_range(inode,
							newsize, (loff_t)-1);
			/* unmap again to remove racily COWed private pages */
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);

			/*
			 * Part of the huge page can be beyond i_size: subject
			 * to shrink under memory pressure.
			 */
			if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
				spin_lock(&sbinfo->shrinklist_lock);
				/*
				 * _careful to defend against unlocked access to
				 * ->shrink_list in shmem_unused_huge_shrink()
				 */
				if (list_empty_careful(&info->shrinklist)) {
					list_add_tail(&info->shrinklist,
							&sbinfo->shrinklist);
					sbinfo->shrinklist_len++;
				}
				spin_unlock(&sbinfo->shrinklist_lock);
			}
		}
	}

	setattr_copy(inode, attr);
	if (attr->ia_valid & ATTR_MODE)
		error = posix_acl_chmod(inode, inode->i_mode);
	return error;
}

static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (inode->i_mapping->a_ops == &shmem_aops) {
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		shmem_truncate_range(inode, 0, (loff_t)-1);
		if (!list_empty(&info->shrinklist)) {
			spin_lock(&sbinfo->shrinklist_lock);
			if (!list_empty(&info->shrinklist)) {
				list_del_init(&info->shrinklist);
				sbinfo->shrinklist_len--;
			}
			spin_unlock(&sbinfo->shrinklist_lock);
		}
		if (!list_empty(&info->swaplist)) {
			mutex_lock(&shmem_swaplist_mutex);
			list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	}

	simple_xattrs_free(&info->xattrs);
	WARN_ON(inode->i_blocks);
	shmem_free_inode(inode->i_sb);
	clear_inode(inode);
}

static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
{
	struct radix_tree_iter iter;
	void __rcu **slot;
	unsigned long found = -1;
	unsigned int checked = 0;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, root, &iter, 0) {
		void *entry = radix_tree_deref_slot(slot);

		if (radix_tree_deref_retry(entry)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		if (entry == item) {
			found = iter.index;
			break;
		}
		checked++;
		if ((checked % 4096) != 0)
			continue;
		slot = radix_tree_iter_resume(slot, &iter);
		cond_resched_rcu();
	}

	rcu_read_unlock();
	return found;
}
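/*
 * Return convention relied on by shmem_unuse() below: -EAGAIN means the swap
 * entry was not found in this inode (keep scanning the swaplist), -ENOMEM
 * aborts the scan, and any other result means the entry has been dealt with.
 */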
/*
 * If swap found in inode, free it and move page from swapcache to filecache.
 */
static int shmem_unuse_inode(struct shmem_inode_info *info,
			     swp_entry_t swap, struct page **pagep)
{
	struct address_space *mapping = info->vfs_inode.i_mapping;
	void *radswap;
	pgoff_t index;
	gfp_t gfp;
	int error = 0;

	radswap = swp_to_radix_entry(swap);
	index = find_swap_entry(&mapping->i_pages, radswap);
	if (index == -1)
		return -EAGAIN;	/* tell shmem_unuse we found nothing */

	/*
	 * Move _head_ to start search for next from here.
	 * But be careful: shmem_evict_inode checks list_empty without taking
	 * mutex, and there's an instant in list_move_tail when info->swaplist
	 * would appear empty, if it were the only one on shmem_swaplist.
	 */
	if (shmem_swaplist.next != &info->swaplist)
		list_move_tail(&shmem_swaplist, &info->swaplist);

	gfp = mapping_gfp_mask(mapping);
	if (shmem_should_replace_page(*pagep, gfp)) {
		mutex_unlock(&shmem_swaplist_mutex);
		error = shmem_replace_page(pagep, gfp, info, index);
		mutex_lock(&shmem_swaplist_mutex);
		/*
		 * We needed to drop mutex to make that restrictive page
		 * allocation, but the inode might have been freed while we
		 * dropped it: although a racing shmem_evict_inode() cannot
		 * complete without emptying the radix_tree, our page lock
		 * on this swapcache page is not enough to prevent that -
		 * free_swap_and_cache() of our swap entry will only
		 * trylock_page(), removing swap from radix_tree whatever.
		 *
		 * We must not proceed to shmem_add_to_page_cache() if the
		 * inode has been freed, but of course we cannot rely on
		 * inode or mapping or info to check that.  However, we can
		 * safely check if our swap entry is still in use (and here
		 * it can't have got reused for another page): if it's still
		 * in use, then the inode cannot have been freed yet, and we
		 * can safely proceed (if it's no longer in use, that tells
		 * nothing about the inode, but we don't need to unuse swap).
		 */
		if (!page_swapcount(*pagep))
			error = -ENOENT;
	}

	/*
	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
	 * beneath us (pagelock doesn't help until the page is in pagecache).
	 */
	if (!error)
		error = shmem_add_to_page_cache(*pagep, mapping, index,
						radswap);
	if (error != -ENOMEM) {
		/*
		 * Truncation and eviction use free_swap_and_cache(), which
		 * only does trylock page: if we raced, best clean up here.
		 */
		delete_from_swap_cache(*pagep);
		set_page_dirty(*pagep);
		if (!error) {
			spin_lock_irq(&info->lock);
			info->swapped--;
			spin_unlock_irq(&info->lock);
			swap_free(swap);
		}
	}
	return error;
}
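/*
 * shmem_unuse() is the tmpfs half of swapoff: mm/swapfile.c hands us each
 * still-referenced swap page, and we walk shmem_swaplist until some inode
 * claims it (or every swapped shmem inode has been searched).
 */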
/*
 * Search through swapped inodes to find and replace swap by page.
 */
int shmem_unuse(swp_entry_t swap, struct page *page)
{
	struct list_head *this, *next;
	struct shmem_inode_info *info;
	struct mem_cgroup *memcg;
	int error = 0;

	/*
	 * There's a faint possibility that swap page was replaced before
	 * caller locked it: caller will come back later with the right page.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
		goto out;

	/*
	 * Charge page using GFP_KERNEL while we can wait, before taking
	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
	 * Charged back to the user (not to caller) when swap account is used.
	 */
	error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
					    &memcg, false);
	if (error)
		goto out;
	/* No radix_tree_preload: swap entry keeps a place for page in tree */
	error = -EAGAIN;

	mutex_lock(&shmem_swaplist_mutex);
	list_for_each_safe(this, next, &shmem_swaplist) {
		info = list_entry(this, struct shmem_inode_info, swaplist);
		if (info->swapped)
			error = shmem_unuse_inode(info, swap, &page);
		else
			list_del_init(&info->swaplist);
		cond_resched();
		if (error != -EAGAIN)
			break;
		/* found nothing in this: move on to search the next */
	}
	mutex_unlock(&shmem_swaplist_mutex);

	if (error) {
		if (error != -ENOMEM)
			error = 0;
		mem_cgroup_cancel_charge(page, memcg, false);
	} else
		mem_cgroup_commit_charge(page, memcg, true, false);
out:
	unlock_page(page);
	put_page(page);
	return error;
}

/*
 * Move the page from the page cache to the swap cache.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	struct address_space *mapping;
	struct inode *inode;
	swp_entry_t swap;
	pgoff_t index;

	VM_BUG_ON_PAGE(PageCompound(page), page);
	BUG_ON(!PageLocked(page));
	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	if (info->flags & VM_LOCKED)
		goto redirty;
	if (!total_swap_pages)
		goto redirty;

	/*
	 * Our capabilities prevent regular writeback or sync from ever calling
	 * shmem_writepage; but a stacking filesystem might use ->writepage of
	 * its underlying filesystem, in which case tmpfs should write out to
	 * swap only in response to memory pressure, and not for the writeback
	 * threads or sync.
	 */
	if (!wbc->for_reclaim) {
		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
		goto redirty;
	}

	/*
	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
	 * value into swapfile.c, the only way we can correctly account for a
	 * fallocated page arriving here is now to initialize it and write it.
	 *
	 * That's okay for a page already fallocated earlier, but if we have
	 * not yet completed the fallocation, then (a) we want to keep track
	 * of this page in case we have to undo it, and (b) it may not be a
	 * good idea to continue anyway, once we're pushing into swap.  So
	 * reactivate the page, and let shmem_fallocate() quit when too many.
	 */
	if (!PageUptodate(page)) {
		if (inode->i_private) {
			struct shmem_falloc *shmem_falloc;
			spin_lock(&inode->i_lock);
			shmem_falloc = inode->i_private;
			if (shmem_falloc &&
			    !shmem_falloc->waitq &&
			    index >= shmem_falloc->start &&
			    index < shmem_falloc->next)
				shmem_falloc->nr_unswapped++;
			else
				shmem_falloc = NULL;
			spin_unlock(&inode->i_lock);
			if (shmem_falloc)
				goto redirty;
		}
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
	}

	swap = get_swap_page(page);
	if (!swap.val)
		goto redirty;

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there.  Do it now before the page is
	 * moved to swap cache, when its pagelock no longer protects
	 * the inode from eviction.  But don't unlock the mutex until
	 * we've incremented swapped, because shmem_unuse_inode() will
	 * prune a !swapped inode from the swaplist under this mutex.
	 */
	mutex_lock(&shmem_swaplist_mutex);
	if (list_empty(&info->swaplist))
		list_add_tail(&info->swaplist, &shmem_swaplist);

	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		info->swapped++;
		spin_unlock_irq(&info->lock);

		swap_shmem_alloc(swap);
		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));

		mutex_unlock(&shmem_swaplist_mutex);
		BUG_ON(page_mapped(page));
		swap_writepage(page, wbc);
		return 0;
	}

	mutex_unlock(&shmem_swaplist_mutex);
	put_swap_page(page, swap);
redirty:
	set_page_dirty(page);
	if (wbc->for_reclaim)
		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
	unlock_page(page);
	return 0;
}

#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	char buffer[64];

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		return;		/* show nothing */

	mpol_to_str(buffer, sizeof(buffer), mpol);

	seq_printf(seq, ",mpol=%s", buffer);
}

static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	struct mempolicy *mpol = NULL;
	if (sbinfo->mpol) {
		spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
		mpol = sbinfo->mpol;
		mpol_get(mpol);
		spin_unlock(&sbinfo->stat_lock);
	}
	return mpol;
}
#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	return NULL;
}
#endif /* CONFIG_NUMA && CONFIG_TMPFS */
#ifndef CONFIG_NUMA
#define vm_policy vm_private_data
#endif

static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
		struct shmem_inode_info *info, pgoff_t index)
{
	/* Create a pseudo vma that just contains the policy */
	vma_init(vma, NULL);
	/* Bias interleave by inode number to distribute better across nodes */
	vma->vm_pgoff = index + info->vfs_inode.i_ino;
	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
}

static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
{
	/* Drop reference taken by mpol_shared_policy_lookup() */
	mpol_cond_put(vma->vm_policy);
}

static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;
	struct page *page;
	struct vm_fault vmf;

	shmem_pseudo_vma_init(&pvma, info, index);
	vmf.vma = &pvma;
	vmf.address = 0;
	page = swap_cluster_readahead(swap, gfp, &vmf);
	shmem_pseudo_vma_destroy(&pvma);

	return page;
}

static struct page *shmem_alloc_hugepage(gfp_t gfp,
		struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;
	struct inode *inode = &info->vfs_inode;
	struct address_space *mapping = inode->i_mapping;
	pgoff_t idx, hindex;
	void __rcu **results;
	struct page *page;

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
		return NULL;

	hindex = round_down(index, HPAGE_PMD_NR);
	rcu_read_lock();
	if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx,
				hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
		rcu_read_unlock();
		return NULL;
	}
	rcu_read_unlock();

	shmem_pseudo_vma_init(&pvma, info, hindex);
	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
	shmem_pseudo_vma_destroy(&pvma);
	if (page)
		prep_transhuge_page(page);
	return page;
}

static struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;
	struct page *page;

	shmem_pseudo_vma_init(&pvma, info, index);
	page = alloc_page_vma(gfp, &pvma, 0);
	shmem_pseudo_vma_destroy(&pvma);

	return page;
}

static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
		struct inode *inode,
		pgoff_t index, bool huge)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct page *page;
	int nr;
	int err = -ENOSPC;

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
		huge = false;
	nr = huge ? HPAGE_PMD_NR : 1;

	if (!shmem_inode_acct_block(inode, nr))
		goto failed;

	if (huge)
		page = shmem_alloc_hugepage(gfp, info, index);
	else
		page = shmem_alloc_page(gfp, info, index);
	if (page) {
		__SetPageLocked(page);
		__SetPageSwapBacked(page);
		return page;
	}

	err = -ENOMEM;
	shmem_inode_unacct_blocks(inode, nr);
failed:
	return ERR_PTR(err);
}

/*
 * When a page is moved from swapcache to shmem filecache (either by the
 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
 * shmem_unuse_inode()), it may have been read in earlier from swap, in
 * ignorance of the mapping it belongs to.  If that mapping has special
 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
 * we may need to copy to a suitable page before moving to filecache.
 *
 * In a future release, this may well be extended to respect cpuset and
 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
 * but for now it is a simple matter of zone.
 */
static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
{
	return page_zonenum(page) > gfp_zone(gfp);
}

static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index)
{
	struct page *oldpage, *newpage;
	struct address_space *swap_mapping;
	pgoff_t swap_index;
	int error;

	oldpage = *pagep;
	swap_index = page_private(oldpage);
	swap_mapping = page_mapping(oldpage);

	/*
	 * We have arrived here because our zones are constrained, so don't
	 * limit chance of success by further cpuset and node constraints.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
	newpage = shmem_alloc_page(gfp, info, index);
	if (!newpage)
		return -ENOMEM;

	get_page(newpage);
	copy_highpage(newpage, oldpage);
	flush_dcache_page(newpage);

	__SetPageLocked(newpage);
	__SetPageSwapBacked(newpage);
	SetPageUptodate(newpage);
	set_page_private(newpage, swap_index);
	SetPageSwapCache(newpage);

	/*
	 * Our caller will very soon move newpage out of swapcache, but it's
	 * a nice clean interface for us to replace oldpage by newpage there.
	 */
	xa_lock_irq(&swap_mapping->i_pages);
	error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
	if (!error) {
		__inc_node_page_state(newpage, NR_FILE_PAGES);
		__dec_node_page_state(oldpage, NR_FILE_PAGES);
	}
	xa_unlock_irq(&swap_mapping->i_pages);

	if (unlikely(error)) {
		/*
		 * Is this possible?  I think not, now that our callers check
		 * both PageSwapCache and page_private after getting page lock;
		 * but be defensive.  Reverse old to newpage for clear and free.
		 */
		oldpage = newpage;
	} else {
		mem_cgroup_migrate(oldpage, newpage);
		lru_cache_add_anon(newpage);
		*pagep = newpage;
	}

	ClearPageSwapCache(oldpage);
	set_page_private(oldpage, 0);

	unlock_page(oldpage);
	put_page(oldpage);
	put_page(oldpage);
	return error;
}

/*
 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache.
 *
 * fault_mm and fault_type are only supplied by shmem_fault:
 * otherwise they are NULL.
 */
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, gfp_t gfp,
	struct vm_area_struct *vma, struct vm_fault *vmf,
			vm_fault_t *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo;
	struct mm_struct *charge_mm;
	struct mem_cgroup *memcg;
	struct page *page;
	swp_entry_t swap;
	enum sgp_type sgp_huge = sgp;
	pgoff_t hindex = index;
	int error;
	int once = 0;
	int alloced = 0;

	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
		return -EFBIG;
	if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
		sgp = SGP_CACHE;
repeat:
	swap.val = 0;
	page = find_lock_entry(mapping, index);
	if (xa_is_value(page)) {
		swap = radix_to_swp_entry(page);
		page = NULL;
	}

	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
		error = -EINVAL;
		goto unlock;
	}

	if (page && sgp == SGP_WRITE)
		mark_page_accessed(page);

	/* fallocated page? */
	if (page && !PageUptodate(page)) {
		if (sgp != SGP_READ)
			goto clear;
		unlock_page(page);
		put_page(page);
		page = NULL;
	}
	if (page || (sgp == SGP_READ && !swap.val)) {
		*pagep = page;
		return 0;
	}

	/*
	 * Fast cache lookup did not find it:
	 * bring it back from swap or allocate.
	 */
	sbinfo = SHMEM_SB(inode->i_sb);
	charge_mm = vma ? vma->vm_mm : current->mm;

	if (swap.val) {
		/* Look it up and read it in.. */
		page = lookup_swap_cache(swap, NULL, 0);
		if (!page) {
			/* Or update major stats only when swapin succeeds?? */
			if (fault_type) {
				*fault_type |= VM_FAULT_MAJOR;
				count_vm_event(PGMAJFAULT);
				count_memcg_event_mm(charge_mm, PGMAJFAULT);
			}
			/* Here we actually start the io */
			page = shmem_swapin(swap, gfp, info, index);
			if (!page) {
				error = -ENOMEM;
				goto failed;
			}
		}

		/* We have to do this with page locked to prevent races */
		lock_page(page);
		if (!PageSwapCache(page) || page_private(page) != swap.val ||
		    !shmem_confirm_swap(mapping, index, swap)) {
			error = -EEXIST;	/* try again */
			goto unlock;
		}
		if (!PageUptodate(page)) {
			error = -EIO;
			goto failed;
		}
		wait_on_page_writeback(page);

		if (shmem_should_replace_page(page, gfp)) {
			error = shmem_replace_page(&page, gfp, info, index);
			if (error)
				goto failed;
		}

		error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
						    false);
		if (!error) {
			error = shmem_add_to_page_cache(page, mapping, index,
						swp_to_radix_entry(swap));
			/*
			 * We already confirmed swap under page lock, and make
			 * no memory allocation here, so usually no possibility
			 * of error; but free_swap_and_cache() only trylocks a
			 * page, so it is just possible that the entry has been
			 * truncated or holepunched since swap was confirmed.
			 * shmem_undo_range() will have done some of the
			 * unaccounting, now delete_from_swap_cache() will do
			 * the rest.
			 * Reset swap.val? No, leave it so "failed" goes back to
			 * "repeat": reading a hole and writing should succeed.
			 */
			if (error) {
				mem_cgroup_cancel_charge(page, memcg, false);
				delete_from_swap_cache(page);
			}
		}
		if (error)
			goto failed;

		mem_cgroup_commit_charge(page, memcg, true, false);

		spin_lock_irq(&info->lock);
		info->swapped--;
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);

		if (sgp == SGP_WRITE)
			mark_page_accessed(page);

		delete_from_swap_cache(page);
		set_page_dirty(page);
		swap_free(swap);

	} else {
		if (vma && userfaultfd_missing(vma)) {
			*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
			return 0;
		}

		/* shmem_symlink() */
		if (mapping->a_ops != &shmem_aops)
			goto alloc_nohuge;
		if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
			goto alloc_nohuge;
		if (shmem_huge == SHMEM_HUGE_FORCE)
			goto alloc_huge;
		switch (sbinfo->huge) {
			loff_t i_size;
			pgoff_t off;
		case SHMEM_HUGE_NEVER:
			goto alloc_nohuge;
		case SHMEM_HUGE_WITHIN_SIZE:
			off = round_up(index, HPAGE_PMD_NR);
			i_size = round_up(i_size_read(inode), PAGE_SIZE);
			if (i_size >= HPAGE_PMD_SIZE &&
			    i_size >> PAGE_SHIFT >= off)
				goto alloc_huge;
			/* fallthrough */
		case SHMEM_HUGE_ADVISE:
			if (sgp_huge == SGP_HUGE)
				goto alloc_huge;
			/* TODO: implement fadvise() hints */
			goto alloc_nohuge;
		}

alloc_huge:
		page = shmem_alloc_and_acct_page(gfp, inode, index, true);
		if (IS_ERR(page)) {
alloc_nohuge:		page = shmem_alloc_and_acct_page(gfp, inode,
				index, false);
		}
		if (IS_ERR(page)) {
			int retry = 5;
			error = PTR_ERR(page);
			page = NULL;
			if (error != -ENOSPC)
				goto failed;
			/*
			 * Try to reclaim some space by splitting a huge page
			 * beyond i_size on the filesystem.
			 */
			while (retry--) {
				int ret;
				ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
				if (ret == SHRINK_STOP)
					break;
				if (ret)
					goto alloc_nohuge;
			}
			goto failed;
		}

		if (PageTransHuge(page))
			hindex = round_down(index, HPAGE_PMD_NR);
		else
			hindex = index;

		if (sgp == SGP_WRITE)
			__SetPageReferenced(page);

		error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
						    PageTransHuge(page));
		if (error)
			goto unacct;
		error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
				compound_order(page));
		if (!error) {
			error = shmem_add_to_page_cache(page, mapping, hindex,
							NULL);
			radix_tree_preload_end();
		}
		if (error) {
			mem_cgroup_cancel_charge(page, memcg,
						 PageTransHuge(page));
			goto unacct;
		}
		mem_cgroup_commit_charge(page, memcg, false,
					 PageTransHuge(page));
		lru_cache_add_anon(page);

		spin_lock_irq(&info->lock);
		info->alloced += 1 << compound_order(page);
		inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);
		alloced = true;

		if (PageTransHuge(page) &&
				DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
				hindex + HPAGE_PMD_NR - 1) {
			/*
			 * Part of the huge page is beyond i_size: subject
			 * to shrink under memory pressure.
			 */
			spin_lock(&sbinfo->shrinklist_lock);
			/*
			 * _careful to defend against unlocked access to
			 * ->shrink_list in shmem_unused_huge_shrink()
			 */
			if (list_empty_careful(&info->shrinklist)) {
				list_add_tail(&info->shrinklist,
						&sbinfo->shrinklist);
				sbinfo->shrinklist_len++;
			}
			spin_unlock(&sbinfo->shrinklist_lock);
		}

		/*
		 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
		 */
		if (sgp == SGP_FALLOC)
			sgp = SGP_WRITE;
clear:
		/*
		 * Let SGP_WRITE caller clear ends if write does not fill page;
		 * but SGP_FALLOC on a page fallocated earlier must initialize
		 * it now, lest undo on failure cancel our earlier guarantee.
		 */
		if (sgp != SGP_WRITE && !PageUptodate(page)) {
			struct page *head = compound_head(page);
			int i;

			for (i = 0; i < (1 << compound_order(head)); i++) {
				clear_highpage(head + i);
				flush_dcache_page(head + i);
			}
			SetPageUptodate(head);
		}
	}

	/* Perhaps the file has been truncated since we checked */
	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
		if (alloced) {
			ClearPageDirty(page);
			delete_from_page_cache(page);
			spin_lock_irq(&info->lock);
			shmem_recalc_inode(inode);
			spin_unlock_irq(&info->lock);
		}
		error = -EINVAL;
		goto unlock;
	}
	*pagep = page + index - hindex;
	return 0;

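	/*
	 * Error paths, in rough order of severity: "unacct" undoes the block
	 * accounting of a freshly allocated page, "failed" re-checks the swap
	 * entry and converts a racing truncation into -EEXIST, and "unlock" is
	 * the common cleanup.  A first -ENOSPC triggers a recalc and one more
	 * attempt; -EEXIST (a racing insertion) always retries from "repeat".
	 */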
1905 */ 1906 unacct: 1907 shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); 1908 1909 if (PageTransHuge(page)) { 1910 unlock_page(page); 1911 put_page(page); 1912 goto alloc_nohuge; 1913 } 1914 failed: 1915 if (swap.val && !shmem_confirm_swap(mapping, index, swap)) 1916 error = -EEXIST; 1917 unlock: 1918 if (page) { 1919 unlock_page(page); 1920 put_page(page); 1921 } 1922 if (error == -ENOSPC && !once++) { 1923 spin_lock_irq(&info->lock); 1924 shmem_recalc_inode(inode); 1925 spin_unlock_irq(&info->lock); 1926 goto repeat; 1927 } 1928 if (error == -EEXIST) /* from above or from radix_tree_insert */ 1929 goto repeat; 1930 return error; 1931 } 1932 1933 /* 1934 * This is like autoremove_wake_function, but it removes the wait queue 1935 * entry unconditionally - even if something else had already woken the 1936 * target. 1937 */ 1938 static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 1939 { 1940 int ret = default_wake_function(wait, mode, sync, key); 1941 list_del_init(&wait->entry); 1942 return ret; 1943 } 1944 1945 static vm_fault_t shmem_fault(struct vm_fault *vmf) 1946 { 1947 struct vm_area_struct *vma = vmf->vma; 1948 struct inode *inode = file_inode(vma->vm_file); 1949 gfp_t gfp = mapping_gfp_mask(inode->i_mapping); 1950 enum sgp_type sgp; 1951 int err; 1952 vm_fault_t ret = VM_FAULT_LOCKED; 1953 1954 /* 1955 * Trinity finds that probing a hole which tmpfs is punching can 1956 * prevent the hole-punch from ever completing: which in turn 1957 * locks writers out with its hold on i_mutex. So refrain from 1958 * faulting pages into the hole while it's being punched. Although 1959 * shmem_undo_range() does remove the additions, it may be unable to 1960 * keep up, as each new page needs its own unmap_mapping_range() call, 1961 * and the i_mmap tree grows ever slower to scan if new vmas are added. 1962 * 1963 * It does not matter if we sometimes reach this check just before the 1964 * hole-punch begins, so that one fault then races with the punch: 1965 * we just need to make racing faults a rare case. 1966 * 1967 * The implementation below would be much simpler if we just used a 1968 * standard mutex or completion: but we cannot take i_mutex in fault, 1969 * and bloating every shmem inode for this unlikely case would be sad. 1970 */ 1971 if (unlikely(inode->i_private)) { 1972 struct shmem_falloc *shmem_falloc; 1973 1974 spin_lock(&inode->i_lock); 1975 shmem_falloc = inode->i_private; 1976 if (shmem_falloc && 1977 shmem_falloc->waitq && 1978 vmf->pgoff >= shmem_falloc->start && 1979 vmf->pgoff < shmem_falloc->next) { 1980 wait_queue_head_t *shmem_falloc_waitq; 1981 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); 1982 1983 ret = VM_FAULT_NOPAGE; 1984 if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && 1985 !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { 1986 /* It's polite to up mmap_sem if we can */ 1987 up_read(&vma->vm_mm->mmap_sem); 1988 ret = VM_FAULT_RETRY; 1989 } 1990 1991 shmem_falloc_waitq = shmem_falloc->waitq; 1992 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, 1993 TASK_UNINTERRUPTIBLE); 1994 spin_unlock(&inode->i_lock); 1995 schedule(); 1996 1997 /* 1998 * shmem_falloc_waitq points into the shmem_fallocate() 1999 * stack of the hole-punching task: shmem_falloc_waitq 2000 * is usually invalid by the time we reach here, but 2001 * finish_wait() does not dereference it in that case; 2002 * though i_lock needed lest racing with wake_up_all(). 
2003 */ 2004 spin_lock(&inode->i_lock); 2005 finish_wait(shmem_falloc_waitq, &shmem_fault_wait); 2006 spin_unlock(&inode->i_lock); 2007 return ret; 2008 } 2009 spin_unlock(&inode->i_lock); 2010 } 2011 2012 sgp = SGP_CACHE; 2013 2014 if ((vma->vm_flags & VM_NOHUGEPAGE) || 2015 test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) 2016 sgp = SGP_NOHUGE; 2017 else if (vma->vm_flags & VM_HUGEPAGE) 2018 sgp = SGP_HUGE; 2019 2020 err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, 2021 gfp, vma, vmf, &ret); 2022 if (err) 2023 return vmf_error(err); 2024 return ret; 2025 } 2026 2027 unsigned long shmem_get_unmapped_area(struct file *file, 2028 unsigned long uaddr, unsigned long len, 2029 unsigned long pgoff, unsigned long flags) 2030 { 2031 unsigned long (*get_area)(struct file *, 2032 unsigned long, unsigned long, unsigned long, unsigned long); 2033 unsigned long addr; 2034 unsigned long offset; 2035 unsigned long inflated_len; 2036 unsigned long inflated_addr; 2037 unsigned long inflated_offset; 2038 2039 if (len > TASK_SIZE) 2040 return -ENOMEM; 2041 2042 get_area = current->mm->get_unmapped_area; 2043 addr = get_area(file, uaddr, len, pgoff, flags); 2044 2045 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) 2046 return addr; 2047 if (IS_ERR_VALUE(addr)) 2048 return addr; 2049 if (addr & ~PAGE_MASK) 2050 return addr; 2051 if (addr > TASK_SIZE - len) 2052 return addr; 2053 2054 if (shmem_huge == SHMEM_HUGE_DENY) 2055 return addr; 2056 if (len < HPAGE_PMD_SIZE) 2057 return addr; 2058 if (flags & MAP_FIXED) 2059 return addr; 2060 /* 2061 * Our priority is to support MAP_SHARED mapped hugely; 2062 * and support MAP_PRIVATE mapped hugely too, until it is COWed. 2063 * But if caller specified an address hint, respect that as before. 2064 */ 2065 if (uaddr) 2066 return addr; 2067 2068 if (shmem_huge != SHMEM_HUGE_FORCE) { 2069 struct super_block *sb; 2070 2071 if (file) { 2072 VM_BUG_ON(file->f_op != &shmem_file_operations); 2073 sb = file_inode(file)->i_sb; 2074 } else { 2075 /* 2076 * Called directly from mm/mmap.c, or drivers/char/mem.c 2077 * for "/dev/zero", to create a shared anonymous object. 
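 * In that case there is no struct file to take the superblock from, so
 * fall back to the kernel-internal shm_mnt mount (set up by shmem_init())
 * for the per-sb huge= setting checked below.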
2078 */ 2079 if (IS_ERR(shm_mnt)) 2080 return addr; 2081 sb = shm_mnt->mnt_sb; 2082 } 2083 if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) 2084 return addr; 2085 } 2086 2087 offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); 2088 if (offset && offset + len < 2 * HPAGE_PMD_SIZE) 2089 return addr; 2090 if ((addr & (HPAGE_PMD_SIZE-1)) == offset) 2091 return addr; 2092 2093 inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; 2094 if (inflated_len > TASK_SIZE) 2095 return addr; 2096 if (inflated_len < len) 2097 return addr; 2098 2099 inflated_addr = get_area(NULL, 0, inflated_len, 0, flags); 2100 if (IS_ERR_VALUE(inflated_addr)) 2101 return addr; 2102 if (inflated_addr & ~PAGE_MASK) 2103 return addr; 2104 2105 inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); 2106 inflated_addr += offset - inflated_offset; 2107 if (inflated_offset > offset) 2108 inflated_addr += HPAGE_PMD_SIZE; 2109 2110 if (inflated_addr > TASK_SIZE - len) 2111 return addr; 2112 return inflated_addr; 2113 } 2114 2115 #ifdef CONFIG_NUMA 2116 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 2117 { 2118 struct inode *inode = file_inode(vma->vm_file); 2119 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 2120 } 2121 2122 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 2123 unsigned long addr) 2124 { 2125 struct inode *inode = file_inode(vma->vm_file); 2126 pgoff_t index; 2127 2128 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2129 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 2130 } 2131 #endif 2132 2133 int shmem_lock(struct file *file, int lock, struct user_struct *user) 2134 { 2135 struct inode *inode = file_inode(file); 2136 struct shmem_inode_info *info = SHMEM_I(inode); 2137 int retval = -ENOMEM; 2138 2139 spin_lock_irq(&info->lock); 2140 if (lock && !(info->flags & VM_LOCKED)) { 2141 if (!user_shm_lock(inode->i_size, user)) 2142 goto out_nomem; 2143 info->flags |= VM_LOCKED; 2144 mapping_set_unevictable(file->f_mapping); 2145 } 2146 if (!lock && (info->flags & VM_LOCKED) && user) { 2147 user_shm_unlock(inode->i_size, user); 2148 info->flags &= ~VM_LOCKED; 2149 mapping_clear_unevictable(file->f_mapping); 2150 } 2151 retval = 0; 2152 2153 out_nomem: 2154 spin_unlock_irq(&info->lock); 2155 return retval; 2156 } 2157 2158 static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 2159 { 2160 file_accessed(file); 2161 vma->vm_ops = &shmem_vm_ops; 2162 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && 2163 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < 2164 (vma->vm_end & HPAGE_PMD_MASK)) { 2165 khugepaged_enter(vma, vma->vm_flags); 2166 } 2167 return 0; 2168 } 2169 2170 static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, 2171 umode_t mode, dev_t dev, unsigned long flags) 2172 { 2173 struct inode *inode; 2174 struct shmem_inode_info *info; 2175 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2176 2177 if (shmem_reserve_inode(sb)) 2178 return NULL; 2179 2180 inode = new_inode(sb); 2181 if (inode) { 2182 inode->i_ino = get_next_ino(); 2183 inode_init_owner(inode, dir, mode); 2184 inode->i_blocks = 0; 2185 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); 2186 inode->i_generation = prandom_u32(); 2187 info = SHMEM_I(inode); 2188 memset(info, 0, (char *)inode - (char *)info); 2189 spin_lock_init(&info->lock); 2190 info->seals = F_SEAL_SEAL; 2191 info->flags = flags & VM_NORESERVE; 2192 INIT_LIST_HEAD(&info->shrinklist); 2193 
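/*
 * shrinklist links this inode into the per-sb list of inodes whose huge
 * pages extend beyond i_size (see shmem_getpage_gfp() above); swaplist
 * links it into the global list of inodes that have pages out on swap.
 */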
INIT_LIST_HEAD(&info->swaplist); 2194 simple_xattrs_init(&info->xattrs); 2195 cache_no_acl(inode); 2196 2197 switch (mode & S_IFMT) { 2198 default: 2199 inode->i_op = &shmem_special_inode_operations; 2200 init_special_inode(inode, mode, dev); 2201 break; 2202 case S_IFREG: 2203 inode->i_mapping->a_ops = &shmem_aops; 2204 inode->i_op = &shmem_inode_operations; 2205 inode->i_fop = &shmem_file_operations; 2206 mpol_shared_policy_init(&info->policy, 2207 shmem_get_sbmpol(sbinfo)); 2208 break; 2209 case S_IFDIR: 2210 inc_nlink(inode); 2211 /* Some things misbehave if size == 0 on a directory */ 2212 inode->i_size = 2 * BOGO_DIRENT_SIZE; 2213 inode->i_op = &shmem_dir_inode_operations; 2214 inode->i_fop = &simple_dir_operations; 2215 break; 2216 case S_IFLNK: 2217 /* 2218 * Must not load anything in the rbtree, 2219 * mpol_free_shared_policy will not be called. 2220 */ 2221 mpol_shared_policy_init(&info->policy, NULL); 2222 break; 2223 } 2224 2225 lockdep_annotate_inode_mutex_key(inode); 2226 } else 2227 shmem_free_inode(sb); 2228 return inode; 2229 } 2230 2231 bool shmem_mapping(struct address_space *mapping) 2232 { 2233 return mapping->a_ops == &shmem_aops; 2234 } 2235 2236 static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, 2237 pmd_t *dst_pmd, 2238 struct vm_area_struct *dst_vma, 2239 unsigned long dst_addr, 2240 unsigned long src_addr, 2241 bool zeropage, 2242 struct page **pagep) 2243 { 2244 struct inode *inode = file_inode(dst_vma->vm_file); 2245 struct shmem_inode_info *info = SHMEM_I(inode); 2246 struct address_space *mapping = inode->i_mapping; 2247 gfp_t gfp = mapping_gfp_mask(mapping); 2248 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 2249 struct mem_cgroup *memcg; 2250 spinlock_t *ptl; 2251 void *page_kaddr; 2252 struct page *page; 2253 pte_t _dst_pte, *dst_pte; 2254 int ret; 2255 2256 ret = -ENOMEM; 2257 if (!shmem_inode_acct_block(inode, 1)) 2258 goto out; 2259 2260 if (!*pagep) { 2261 page = shmem_alloc_page(gfp, info, pgoff); 2262 if (!page) 2263 goto out_unacct_blocks; 2264 2265 if (!zeropage) { /* mcopy_atomic */ 2266 page_kaddr = kmap_atomic(page); 2267 ret = copy_from_user(page_kaddr, 2268 (const void __user *)src_addr, 2269 PAGE_SIZE); 2270 kunmap_atomic(page_kaddr); 2271 2272 /* fallback to copy_from_user outside mmap_sem */ 2273 if (unlikely(ret)) { 2274 *pagep = page; 2275 shmem_inode_unacct_blocks(inode, 1); 2276 /* don't free the page */ 2277 return -EFAULT; 2278 } 2279 } else { /* mfill_zeropage_atomic */ 2280 clear_highpage(page); 2281 } 2282 } else { 2283 page = *pagep; 2284 *pagep = NULL; 2285 } 2286 2287 VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); 2288 __SetPageLocked(page); 2289 __SetPageSwapBacked(page); 2290 __SetPageUptodate(page); 2291 2292 ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); 2293 if (ret) 2294 goto out_release; 2295 2296 ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); 2297 if (!ret) { 2298 ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); 2299 radix_tree_preload_end(); 2300 } 2301 if (ret) 2302 goto out_release_uncharge; 2303 2304 mem_cgroup_commit_charge(page, memcg, false, false); 2305 2306 _dst_pte = mk_pte(page, dst_vma->vm_page_prot); 2307 if (dst_vma->vm_flags & VM_WRITE) 2308 _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); 2309 2310 ret = -EEXIST; 2311 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 2312 if (!pte_none(*dst_pte)) 2313 goto out_release_uncharge_unlock; 2314 2315 lru_cache_add_anon(page); 2316 2317 spin_lock(&info->lock); 2318 info->alloced++; 2319 
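/* same per-inode accounting as shmem_getpage_gfp() does for a small page */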
inode->i_blocks += BLOCKS_PER_PAGE; 2320 shmem_recalc_inode(inode); 2321 spin_unlock(&info->lock); 2322 2323 inc_mm_counter(dst_mm, mm_counter_file(page)); 2324 page_add_file_rmap(page, false); 2325 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 2326 2327 /* No need to invalidate - it was non-present before */ 2328 update_mmu_cache(dst_vma, dst_addr, dst_pte); 2329 unlock_page(page); 2330 pte_unmap_unlock(dst_pte, ptl); 2331 ret = 0; 2332 out: 2333 return ret; 2334 out_release_uncharge_unlock: 2335 pte_unmap_unlock(dst_pte, ptl); 2336 out_release_uncharge: 2337 mem_cgroup_cancel_charge(page, memcg, false); 2338 out_release: 2339 unlock_page(page); 2340 put_page(page); 2341 out_unacct_blocks: 2342 shmem_inode_unacct_blocks(inode, 1); 2343 goto out; 2344 } 2345 2346 int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, 2347 pmd_t *dst_pmd, 2348 struct vm_area_struct *dst_vma, 2349 unsigned long dst_addr, 2350 unsigned long src_addr, 2351 struct page **pagep) 2352 { 2353 return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, 2354 dst_addr, src_addr, false, pagep); 2355 } 2356 2357 int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, 2358 pmd_t *dst_pmd, 2359 struct vm_area_struct *dst_vma, 2360 unsigned long dst_addr) 2361 { 2362 struct page *page = NULL; 2363 2364 return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, 2365 dst_addr, 0, true, &page); 2366 } 2367 2368 #ifdef CONFIG_TMPFS 2369 static const struct inode_operations shmem_symlink_inode_operations; 2370 static const struct inode_operations shmem_short_symlink_operations; 2371 2372 #ifdef CONFIG_TMPFS_XATTR 2373 static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 2374 #else 2375 #define shmem_initxattrs NULL 2376 #endif 2377 2378 static int 2379 shmem_write_begin(struct file *file, struct address_space *mapping, 2380 loff_t pos, unsigned len, unsigned flags, 2381 struct page **pagep, void **fsdata) 2382 { 2383 struct inode *inode = mapping->host; 2384 struct shmem_inode_info *info = SHMEM_I(inode); 2385 pgoff_t index = pos >> PAGE_SHIFT; 2386 2387 /* i_mutex is held by caller */ 2388 if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { 2389 if (info->seals & F_SEAL_WRITE) 2390 return -EPERM; 2391 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 2392 return -EPERM; 2393 } 2394 2395 return shmem_getpage(inode, index, pagep, SGP_WRITE); 2396 } 2397 2398 static int 2399 shmem_write_end(struct file *file, struct address_space *mapping, 2400 loff_t pos, unsigned len, unsigned copied, 2401 struct page *page, void *fsdata) 2402 { 2403 struct inode *inode = mapping->host; 2404 2405 if (pos + copied > inode->i_size) 2406 i_size_write(inode, pos + copied); 2407 2408 if (!PageUptodate(page)) { 2409 struct page *head = compound_head(page); 2410 if (PageTransCompound(page)) { 2411 int i; 2412 2413 for (i = 0; i < HPAGE_PMD_NR; i++) { 2414 if (head + i == page) 2415 continue; 2416 clear_highpage(head + i); 2417 flush_dcache_page(head + i); 2418 } 2419 } 2420 if (copied < PAGE_SIZE) { 2421 unsigned from = pos & (PAGE_SIZE - 1); 2422 zero_user_segments(page, 0, from, 2423 from + copied, PAGE_SIZE); 2424 } 2425 SetPageUptodate(head); 2426 } 2427 set_page_dirty(page); 2428 unlock_page(page); 2429 put_page(page); 2430 2431 return copied; 2432 } 2433 2434 static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 2435 { 2436 struct file *file = iocb->ki_filp; 2437 struct inode *inode = file_inode(file); 2438 struct address_space *mapping = inode->i_mapping; 2439 pgoff_t index; 2440 
unsigned long offset; 2441 enum sgp_type sgp = SGP_READ; 2442 int error = 0; 2443 ssize_t retval = 0; 2444 loff_t *ppos = &iocb->ki_pos; 2445 2446 /* 2447 * Might this read be for a stacking filesystem? Then when reading 2448 * holes of a sparse file, we actually need to allocate those pages, 2449 * and even mark them dirty, so it cannot exceed the max_blocks limit. 2450 */ 2451 if (!iter_is_iovec(to)) 2452 sgp = SGP_CACHE; 2453 2454 index = *ppos >> PAGE_SHIFT; 2455 offset = *ppos & ~PAGE_MASK; 2456 2457 for (;;) { 2458 struct page *page = NULL; 2459 pgoff_t end_index; 2460 unsigned long nr, ret; 2461 loff_t i_size = i_size_read(inode); 2462 2463 end_index = i_size >> PAGE_SHIFT; 2464 if (index > end_index) 2465 break; 2466 if (index == end_index) { 2467 nr = i_size & ~PAGE_MASK; 2468 if (nr <= offset) 2469 break; 2470 } 2471 2472 error = shmem_getpage(inode, index, &page, sgp); 2473 if (error) { 2474 if (error == -EINVAL) 2475 error = 0; 2476 break; 2477 } 2478 if (page) { 2479 if (sgp == SGP_CACHE) 2480 set_page_dirty(page); 2481 unlock_page(page); 2482 } 2483 2484 /* 2485 * We must evaluate after, since reads (unlike writes) 2486 * are called without i_mutex protection against truncate 2487 */ 2488 nr = PAGE_SIZE; 2489 i_size = i_size_read(inode); 2490 end_index = i_size >> PAGE_SHIFT; 2491 if (index == end_index) { 2492 nr = i_size & ~PAGE_MASK; 2493 if (nr <= offset) { 2494 if (page) 2495 put_page(page); 2496 break; 2497 } 2498 } 2499 nr -= offset; 2500 2501 if (page) { 2502 /* 2503 * If users can be writing to this page using arbitrary 2504 * virtual addresses, take care about potential aliasing 2505 * before reading the page on the kernel side. 2506 */ 2507 if (mapping_writably_mapped(mapping)) 2508 flush_dcache_page(page); 2509 /* 2510 * Mark the page accessed if we read the beginning. 2511 */ 2512 if (!offset) 2513 mark_page_accessed(page); 2514 } else { 2515 page = ZERO_PAGE(0); 2516 get_page(page); 2517 } 2518 2519 /* 2520 * Ok, we have the page, and it's up-to-date, so 2521 * now we can copy it to user space... 2522 */ 2523 ret = copy_page_to_iter(page, offset, nr, to); 2524 retval += ret; 2525 offset += ret; 2526 index += offset >> PAGE_SHIFT; 2527 offset &= ~PAGE_MASK; 2528 2529 put_page(page); 2530 if (!iov_iter_count(to)) 2531 break; 2532 if (ret < nr) { 2533 error = -EFAULT; 2534 break; 2535 } 2536 cond_resched(); 2537 } 2538 2539 *ppos = ((loff_t) index << PAGE_SHIFT) + offset; 2540 file_accessed(file); 2541 return retval ? retval : error; 2542 } 2543 2544 /* 2545 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 
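 * Walk the page cache with find_get_entries(): a present Uptodate page
 * or a swap entry counts as data, while an index with no entry (or only
 * a !Uptodate page) counts as a hole.  The pagevec starts with a single
 * entry and only then grows to PAGEVEC_SIZE, since the answer is often
 * found at the starting index.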
2546 */ 2547 static pgoff_t shmem_seek_hole_data(struct address_space *mapping, 2548 pgoff_t index, pgoff_t end, int whence) 2549 { 2550 struct page *page; 2551 struct pagevec pvec; 2552 pgoff_t indices[PAGEVEC_SIZE]; 2553 bool done = false; 2554 int i; 2555 2556 pagevec_init(&pvec); 2557 pvec.nr = 1; /* start small: we may be there already */ 2558 while (!done) { 2559 pvec.nr = find_get_entries(mapping, index, 2560 pvec.nr, pvec.pages, indices); 2561 if (!pvec.nr) { 2562 if (whence == SEEK_DATA) 2563 index = end; 2564 break; 2565 } 2566 for (i = 0; i < pvec.nr; i++, index++) { 2567 if (index < indices[i]) { 2568 if (whence == SEEK_HOLE) { 2569 done = true; 2570 break; 2571 } 2572 index = indices[i]; 2573 } 2574 page = pvec.pages[i]; 2575 if (page && !xa_is_value(page)) { 2576 if (!PageUptodate(page)) 2577 page = NULL; 2578 } 2579 if (index >= end || 2580 (page && whence == SEEK_DATA) || 2581 (!page && whence == SEEK_HOLE)) { 2582 done = true; 2583 break; 2584 } 2585 } 2586 pagevec_remove_exceptionals(&pvec); 2587 pagevec_release(&pvec); 2588 pvec.nr = PAGEVEC_SIZE; 2589 cond_resched(); 2590 } 2591 return index; 2592 } 2593 2594 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) 2595 { 2596 struct address_space *mapping = file->f_mapping; 2597 struct inode *inode = mapping->host; 2598 pgoff_t start, end; 2599 loff_t new_offset; 2600 2601 if (whence != SEEK_DATA && whence != SEEK_HOLE) 2602 return generic_file_llseek_size(file, offset, whence, 2603 MAX_LFS_FILESIZE, i_size_read(inode)); 2604 inode_lock(inode); 2605 /* We're holding i_mutex so we can access i_size directly */ 2606 2607 if (offset < 0) 2608 offset = -EINVAL; 2609 else if (offset >= inode->i_size) 2610 offset = -ENXIO; 2611 else { 2612 start = offset >> PAGE_SHIFT; 2613 end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2614 new_offset = shmem_seek_hole_data(mapping, start, end, whence); 2615 new_offset <<= PAGE_SHIFT; 2616 if (new_offset > offset) { 2617 if (new_offset < inode->i_size) 2618 offset = new_offset; 2619 else if (whence == SEEK_DATA) 2620 offset = -ENXIO; 2621 else 2622 offset = inode->i_size; 2623 } 2624 } 2625 2626 if (offset >= 0) 2627 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); 2628 inode_unlock(inode); 2629 return offset; 2630 } 2631 2632 static long shmem_fallocate(struct file *file, int mode, loff_t offset, 2633 loff_t len) 2634 { 2635 struct inode *inode = file_inode(file); 2636 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 2637 struct shmem_inode_info *info = SHMEM_I(inode); 2638 struct shmem_falloc shmem_falloc; 2639 pgoff_t start, index, end; 2640 int error; 2641 2642 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2643 return -EOPNOTSUPP; 2644 2645 inode_lock(inode); 2646 2647 if (mode & FALLOC_FL_PUNCH_HOLE) { 2648 struct address_space *mapping = file->f_mapping; 2649 loff_t unmap_start = round_up(offset, PAGE_SIZE); 2650 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 2651 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 2652 2653 /* protected by i_mutex */ 2654 if (info->seals & F_SEAL_WRITE) { 2655 error = -EPERM; 2656 goto out; 2657 } 2658 2659 shmem_falloc.waitq = &shmem_falloc_waitq; 2660 shmem_falloc.start = unmap_start >> PAGE_SHIFT; 2661 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; 2662 spin_lock(&inode->i_lock); 2663 inode->i_private = &shmem_falloc; 2664 spin_unlock(&inode->i_lock); 2665 2666 if ((u64)unmap_end > (u64)unmap_start) 2667 unmap_mapping_range(mapping, unmap_start, 2668 1 + unmap_end - unmap_start, 0); 
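		/*
		 * Zap the ptes before truncating the pages: faults racing
		 * with the punch see inode->i_private set above and wait
		 * on shmem_falloc_waitq (see shmem_fault()).
		 */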
2669 shmem_truncate_range(inode, offset, offset + len - 1); 2670 /* No need to unmap again: hole-punching leaves COWed pages */ 2671 2672 spin_lock(&inode->i_lock); 2673 inode->i_private = NULL; 2674 wake_up_all(&shmem_falloc_waitq); 2675 WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); 2676 spin_unlock(&inode->i_lock); 2677 error = 0; 2678 goto out; 2679 } 2680 2681 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 2682 error = inode_newsize_ok(inode, offset + len); 2683 if (error) 2684 goto out; 2685 2686 if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { 2687 error = -EPERM; 2688 goto out; 2689 } 2690 2691 start = offset >> PAGE_SHIFT; 2692 end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 2693 /* Try to avoid a swapstorm if len is impossible to satisfy */ 2694 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 2695 error = -ENOSPC; 2696 goto out; 2697 } 2698 2699 shmem_falloc.waitq = NULL; 2700 shmem_falloc.start = start; 2701 shmem_falloc.next = start; 2702 shmem_falloc.nr_falloced = 0; 2703 shmem_falloc.nr_unswapped = 0; 2704 spin_lock(&inode->i_lock); 2705 inode->i_private = &shmem_falloc; 2706 spin_unlock(&inode->i_lock); 2707 2708 for (index = start; index < end; index++) { 2709 struct page *page; 2710 2711 /* 2712 * Good, the fallocate(2) manpage permits EINTR: we may have 2713 * been interrupted because we are using up too much memory. 2714 */ 2715 if (signal_pending(current)) 2716 error = -EINTR; 2717 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 2718 error = -ENOMEM; 2719 else 2720 error = shmem_getpage(inode, index, &page, SGP_FALLOC); 2721 if (error) { 2722 /* Remove the !PageUptodate pages we added */ 2723 if (index > start) { 2724 shmem_undo_range(inode, 2725 (loff_t)start << PAGE_SHIFT, 2726 ((loff_t)index << PAGE_SHIFT) - 1, true); 2727 } 2728 goto undone; 2729 } 2730 2731 /* 2732 * Inform shmem_writepage() how far we have reached. 2733 * No need for lock or barrier: we have the page lock. 2734 */ 2735 shmem_falloc.next++; 2736 if (!PageUptodate(page)) 2737 shmem_falloc.nr_falloced++; 2738 2739 /* 2740 * If !PageUptodate, leave it that way so that freeable pages 2741 * can be recognized if we need to rollback on error later. 2742 * But set_page_dirty so that memory pressure will swap rather 2743 * than free the pages we are allocating (and SGP_CACHE pages 2744 * might still be clean: we now need to mark those dirty too). 2745 */ 2746 set_page_dirty(page); 2747 unlock_page(page); 2748 put_page(page); 2749 cond_resched(); 2750 } 2751 2752 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 2753 i_size_write(inode, offset + len); 2754 inode->i_ctime = current_time(inode); 2755 undone: 2756 spin_lock(&inode->i_lock); 2757 inode->i_private = NULL; 2758 spin_unlock(&inode->i_lock); 2759 out: 2760 inode_unlock(inode); 2761 return error; 2762 } 2763 2764 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 2765 { 2766 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 2767 2768 buf->f_type = TMPFS_MAGIC; 2769 buf->f_bsize = PAGE_SIZE; 2770 buf->f_namelen = NAME_MAX; 2771 if (sbinfo->max_blocks) { 2772 buf->f_blocks = sbinfo->max_blocks; 2773 buf->f_bavail = 2774 buf->f_bfree = sbinfo->max_blocks - 2775 percpu_counter_sum(&sbinfo->used_blocks); 2776 } 2777 if (sbinfo->max_inodes) { 2778 buf->f_files = sbinfo->max_inodes; 2779 buf->f_ffree = sbinfo->free_inodes; 2780 } 2781 /* else leave those fields 0 like simple_statfs */ 2782 return 0; 2783 } 2784 2785 /* 2786 * File creation. 
Allocate an inode, and we're done.. 2787 */ 2788 static int 2789 shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) 2790 { 2791 struct inode *inode; 2792 int error = -ENOSPC; 2793 2794 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 2795 if (inode) { 2796 error = simple_acl_create(dir, inode); 2797 if (error) 2798 goto out_iput; 2799 error = security_inode_init_security(inode, dir, 2800 &dentry->d_name, 2801 shmem_initxattrs, NULL); 2802 if (error && error != -EOPNOTSUPP) 2803 goto out_iput; 2804 2805 error = 0; 2806 dir->i_size += BOGO_DIRENT_SIZE; 2807 dir->i_ctime = dir->i_mtime = current_time(dir); 2808 d_instantiate(dentry, inode); 2809 dget(dentry); /* Extra count - pin the dentry in core */ 2810 } 2811 return error; 2812 out_iput: 2813 iput(inode); 2814 return error; 2815 } 2816 2817 static int 2818 shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 2819 { 2820 struct inode *inode; 2821 int error = -ENOSPC; 2822 2823 inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); 2824 if (inode) { 2825 error = security_inode_init_security(inode, dir, 2826 NULL, 2827 shmem_initxattrs, NULL); 2828 if (error && error != -EOPNOTSUPP) 2829 goto out_iput; 2830 error = simple_acl_create(dir, inode); 2831 if (error) 2832 goto out_iput; 2833 d_tmpfile(dentry, inode); 2834 } 2835 return error; 2836 out_iput: 2837 iput(inode); 2838 return error; 2839 } 2840 2841 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2842 { 2843 int error; 2844 2845 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) 2846 return error; 2847 inc_nlink(dir); 2848 return 0; 2849 } 2850 2851 static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 2852 bool excl) 2853 { 2854 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 2855 } 2856 2857 /* 2858 * Link a file.. 2859 */ 2860 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 2861 { 2862 struct inode *inode = d_inode(old_dentry); 2863 int ret; 2864 2865 /* 2866 * No ordinary (disk based) filesystem counts links as inodes; 2867 * but each new link needs a new dentry, pinning lowmem, and 2868 * tmpfs dentries cannot be pruned until they are unlinked. 
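 * So each new link is charged against the filesystem's inode limit via
 * shmem_reserve_inode() here, and shmem_unlink() returns that charge
 * through shmem_free_inode() when an extra link goes away.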
2869 */ 2870 ret = shmem_reserve_inode(inode->i_sb); 2871 if (ret) 2872 goto out; 2873 2874 dir->i_size += BOGO_DIRENT_SIZE; 2875 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 2876 inc_nlink(inode); 2877 ihold(inode); /* New dentry reference */ 2878 dget(dentry); /* Extra pinning count for the created dentry */ 2879 d_instantiate(dentry, inode); 2880 out: 2881 return ret; 2882 } 2883 2884 static int shmem_unlink(struct inode *dir, struct dentry *dentry) 2885 { 2886 struct inode *inode = d_inode(dentry); 2887 2888 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 2889 shmem_free_inode(inode->i_sb); 2890 2891 dir->i_size -= BOGO_DIRENT_SIZE; 2892 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 2893 drop_nlink(inode); 2894 dput(dentry); /* Undo the count from "create" - this does all the work */ 2895 return 0; 2896 } 2897 2898 static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 2899 { 2900 if (!simple_empty(dentry)) 2901 return -ENOTEMPTY; 2902 2903 drop_nlink(d_inode(dentry)); 2904 drop_nlink(dir); 2905 return shmem_unlink(dir, dentry); 2906 } 2907 2908 static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) 2909 { 2910 bool old_is_dir = d_is_dir(old_dentry); 2911 bool new_is_dir = d_is_dir(new_dentry); 2912 2913 if (old_dir != new_dir && old_is_dir != new_is_dir) { 2914 if (old_is_dir) { 2915 drop_nlink(old_dir); 2916 inc_nlink(new_dir); 2917 } else { 2918 drop_nlink(new_dir); 2919 inc_nlink(old_dir); 2920 } 2921 } 2922 old_dir->i_ctime = old_dir->i_mtime = 2923 new_dir->i_ctime = new_dir->i_mtime = 2924 d_inode(old_dentry)->i_ctime = 2925 d_inode(new_dentry)->i_ctime = current_time(old_dir); 2926 2927 return 0; 2928 } 2929 2930 static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) 2931 { 2932 struct dentry *whiteout; 2933 int error; 2934 2935 whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); 2936 if (!whiteout) 2937 return -ENOMEM; 2938 2939 error = shmem_mknod(old_dir, whiteout, 2940 S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); 2941 dput(whiteout); 2942 if (error) 2943 return error; 2944 2945 /* 2946 * Cheat and hash the whiteout while the old dentry is still in 2947 * place, instead of playing games with FS_RENAME_DOES_D_MOVE. 2948 * 2949 * d_lookup() will consistently find one of them at this point, 2950 * not sure which one, but that isn't even important. 2951 */ 2952 d_rehash(whiteout); 2953 return 0; 2954 } 2955 2956 /* 2957 * The VFS layer already does all the dentry stuff for rename, 2958 * we just have to decrement the usage count for the target if 2959 * it exists so that the VFS layer correctly free's it when it 2960 * gets overwritten. 
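 * RENAME_NOREPLACE is enforced by the VFS before we are called;
 * RENAME_EXCHANGE and RENAME_WHITEOUT are handled by the helpers above.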
2961 */ 2962 static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) 2963 { 2964 struct inode *inode = d_inode(old_dentry); 2965 int they_are_dirs = S_ISDIR(inode->i_mode); 2966 2967 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2968 return -EINVAL; 2969 2970 if (flags & RENAME_EXCHANGE) 2971 return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry); 2972 2973 if (!simple_empty(new_dentry)) 2974 return -ENOTEMPTY; 2975 2976 if (flags & RENAME_WHITEOUT) { 2977 int error; 2978 2979 error = shmem_whiteout(old_dir, old_dentry); 2980 if (error) 2981 return error; 2982 } 2983 2984 if (d_really_is_positive(new_dentry)) { 2985 (void) shmem_unlink(new_dir, new_dentry); 2986 if (they_are_dirs) { 2987 drop_nlink(d_inode(new_dentry)); 2988 drop_nlink(old_dir); 2989 } 2990 } else if (they_are_dirs) { 2991 drop_nlink(old_dir); 2992 inc_nlink(new_dir); 2993 } 2994 2995 old_dir->i_size -= BOGO_DIRENT_SIZE; 2996 new_dir->i_size += BOGO_DIRENT_SIZE; 2997 old_dir->i_ctime = old_dir->i_mtime = 2998 new_dir->i_ctime = new_dir->i_mtime = 2999 inode->i_ctime = current_time(old_dir); 3000 return 0; 3001 } 3002 3003 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 3004 { 3005 int error; 3006 int len; 3007 struct inode *inode; 3008 struct page *page; 3009 3010 len = strlen(symname) + 1; 3011 if (len > PAGE_SIZE) 3012 return -ENAMETOOLONG; 3013 3014 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0, 3015 VM_NORESERVE); 3016 if (!inode) 3017 return -ENOSPC; 3018 3019 error = security_inode_init_security(inode, dir, &dentry->d_name, 3020 shmem_initxattrs, NULL); 3021 if (error) { 3022 if (error != -EOPNOTSUPP) { 3023 iput(inode); 3024 return error; 3025 } 3026 error = 0; 3027 } 3028 3029 inode->i_size = len-1; 3030 if (len <= SHORT_SYMLINK_LEN) { 3031 inode->i_link = kmemdup(symname, len, GFP_KERNEL); 3032 if (!inode->i_link) { 3033 iput(inode); 3034 return -ENOMEM; 3035 } 3036 inode->i_op = &shmem_short_symlink_operations; 3037 } else { 3038 inode_nohighmem(inode); 3039 error = shmem_getpage(inode, 0, &page, SGP_WRITE); 3040 if (error) { 3041 iput(inode); 3042 return error; 3043 } 3044 inode->i_mapping->a_ops = &shmem_aops; 3045 inode->i_op = &shmem_symlink_inode_operations; 3046 memcpy(page_address(page), symname, len); 3047 SetPageUptodate(page); 3048 set_page_dirty(page); 3049 unlock_page(page); 3050 put_page(page); 3051 } 3052 dir->i_size += BOGO_DIRENT_SIZE; 3053 dir->i_ctime = dir->i_mtime = current_time(dir); 3054 d_instantiate(dentry, inode); 3055 dget(dentry); 3056 return 0; 3057 } 3058 3059 static void shmem_put_link(void *arg) 3060 { 3061 mark_page_accessed(arg); 3062 put_page(arg); 3063 } 3064 3065 static const char *shmem_get_link(struct dentry *dentry, 3066 struct inode *inode, 3067 struct delayed_call *done) 3068 { 3069 struct page *page = NULL; 3070 int error; 3071 if (!dentry) { 3072 page = find_get_page(inode->i_mapping, 0); 3073 if (!page) 3074 return ERR_PTR(-ECHILD); 3075 if (!PageUptodate(page)) { 3076 put_page(page); 3077 return ERR_PTR(-ECHILD); 3078 } 3079 } else { 3080 error = shmem_getpage(inode, 0, &page, SGP_READ); 3081 if (error) 3082 return ERR_PTR(error); 3083 unlock_page(page); 3084 } 3085 set_delayed_call(done, shmem_put_link, page); 3086 return page_address(page); 3087 } 3088 3089 #ifdef CONFIG_TMPFS_XATTR 3090 /* 3091 * Superblocks without xattr inode operations may get some security.* xattr 3092 * support from the LSM 
"for free". As soon as we have any other xattrs 3093 * like ACLs, we also need to implement the security.* handlers at 3094 * filesystem level, though. 3095 */ 3096 3097 /* 3098 * Callback for security_inode_init_security() for acquiring xattrs. 3099 */ 3100 static int shmem_initxattrs(struct inode *inode, 3101 const struct xattr *xattr_array, 3102 void *fs_info) 3103 { 3104 struct shmem_inode_info *info = SHMEM_I(inode); 3105 const struct xattr *xattr; 3106 struct simple_xattr *new_xattr; 3107 size_t len; 3108 3109 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 3110 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 3111 if (!new_xattr) 3112 return -ENOMEM; 3113 3114 len = strlen(xattr->name) + 1; 3115 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 3116 GFP_KERNEL); 3117 if (!new_xattr->name) { 3118 kfree(new_xattr); 3119 return -ENOMEM; 3120 } 3121 3122 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 3123 XATTR_SECURITY_PREFIX_LEN); 3124 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 3125 xattr->name, len); 3126 3127 simple_xattr_list_add(&info->xattrs, new_xattr); 3128 } 3129 3130 return 0; 3131 } 3132 3133 static int shmem_xattr_handler_get(const struct xattr_handler *handler, 3134 struct dentry *unused, struct inode *inode, 3135 const char *name, void *buffer, size_t size) 3136 { 3137 struct shmem_inode_info *info = SHMEM_I(inode); 3138 3139 name = xattr_full_name(handler, name); 3140 return simple_xattr_get(&info->xattrs, name, buffer, size); 3141 } 3142 3143 static int shmem_xattr_handler_set(const struct xattr_handler *handler, 3144 struct dentry *unused, struct inode *inode, 3145 const char *name, const void *value, 3146 size_t size, int flags) 3147 { 3148 struct shmem_inode_info *info = SHMEM_I(inode); 3149 3150 name = xattr_full_name(handler, name); 3151 return simple_xattr_set(&info->xattrs, name, value, size, flags); 3152 } 3153 3154 static const struct xattr_handler shmem_security_xattr_handler = { 3155 .prefix = XATTR_SECURITY_PREFIX, 3156 .get = shmem_xattr_handler_get, 3157 .set = shmem_xattr_handler_set, 3158 }; 3159 3160 static const struct xattr_handler shmem_trusted_xattr_handler = { 3161 .prefix = XATTR_TRUSTED_PREFIX, 3162 .get = shmem_xattr_handler_get, 3163 .set = shmem_xattr_handler_set, 3164 }; 3165 3166 static const struct xattr_handler *shmem_xattr_handlers[] = { 3167 #ifdef CONFIG_TMPFS_POSIX_ACL 3168 &posix_acl_access_xattr_handler, 3169 &posix_acl_default_xattr_handler, 3170 #endif 3171 &shmem_security_xattr_handler, 3172 &shmem_trusted_xattr_handler, 3173 NULL 3174 }; 3175 3176 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 3177 { 3178 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3179 return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); 3180 } 3181 #endif /* CONFIG_TMPFS_XATTR */ 3182 3183 static const struct inode_operations shmem_short_symlink_operations = { 3184 .get_link = simple_get_link, 3185 #ifdef CONFIG_TMPFS_XATTR 3186 .listxattr = shmem_listxattr, 3187 #endif 3188 }; 3189 3190 static const struct inode_operations shmem_symlink_inode_operations = { 3191 .get_link = shmem_get_link, 3192 #ifdef CONFIG_TMPFS_XATTR 3193 .listxattr = shmem_listxattr, 3194 #endif 3195 }; 3196 3197 static struct dentry *shmem_get_parent(struct dentry *child) 3198 { 3199 return ERR_PTR(-ESTALE); 3200 } 3201 3202 static int shmem_match(struct inode *ino, void *vfh) 3203 { 3204 __u32 *fh = vfh; 3205 __u64 inum = fh[2]; 3206 inum = (inum << 32) | fh[1]; 3207 return 
ino->i_ino == inum && fh[0] == ino->i_generation; 3208 } 3209 3210 /* Find any alias of inode, but prefer a hashed alias */ 3211 static struct dentry *shmem_find_alias(struct inode *inode) 3212 { 3213 struct dentry *alias = d_find_alias(inode); 3214 3215 return alias ?: d_find_any_alias(inode); 3216 } 3217 3218 3219 static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 3220 struct fid *fid, int fh_len, int fh_type) 3221 { 3222 struct inode *inode; 3223 struct dentry *dentry = NULL; 3224 u64 inum; 3225 3226 if (fh_len < 3) 3227 return NULL; 3228 3229 inum = fid->raw[2]; 3230 inum = (inum << 32) | fid->raw[1]; 3231 3232 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 3233 shmem_match, fid->raw); 3234 if (inode) { 3235 dentry = shmem_find_alias(inode); 3236 iput(inode); 3237 } 3238 3239 return dentry; 3240 } 3241 3242 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 3243 struct inode *parent) 3244 { 3245 if (*len < 3) { 3246 *len = 3; 3247 return FILEID_INVALID; 3248 } 3249 3250 if (inode_unhashed(inode)) { 3251 /* Unfortunately insert_inode_hash is not idempotent, 3252 * so as we hash inodes here rather than at creation 3253 * time, we need a lock to ensure we only try 3254 * to do it once 3255 */ 3256 static DEFINE_SPINLOCK(lock); 3257 spin_lock(&lock); 3258 if (inode_unhashed(inode)) 3259 __insert_inode_hash(inode, 3260 inode->i_ino + inode->i_generation); 3261 spin_unlock(&lock); 3262 } 3263 3264 fh[0] = inode->i_generation; 3265 fh[1] = inode->i_ino; 3266 fh[2] = ((__u64)inode->i_ino) >> 32; 3267 3268 *len = 3; 3269 return 1; 3270 } 3271 3272 static const struct export_operations shmem_export_ops = { 3273 .get_parent = shmem_get_parent, 3274 .encode_fh = shmem_encode_fh, 3275 .fh_to_dentry = shmem_fh_to_dentry, 3276 }; 3277 3278 static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, 3279 bool remount) 3280 { 3281 char *this_char, *value, *rest; 3282 struct mempolicy *mpol = NULL; 3283 uid_t uid; 3284 gid_t gid; 3285 3286 while (options != NULL) { 3287 this_char = options; 3288 for (;;) { 3289 /* 3290 * NUL-terminate this option: unfortunately, 3291 * mount options form a comma-separated list, 3292 * but mpol's nodelist may also contain commas. 
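 * Heuristic: a comma followed by a digit is taken to be part of the
 * current option's nodelist (e.g. mpol=bind:0,2), while a comma
 * followed by anything else terminates the option.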
3293 */ 3294 options = strchr(options, ','); 3295 if (options == NULL) 3296 break; 3297 options++; 3298 if (!isdigit(*options)) { 3299 options[-1] = '\0'; 3300 break; 3301 } 3302 } 3303 if (!*this_char) 3304 continue; 3305 if ((value = strchr(this_char,'=')) != NULL) { 3306 *value++ = 0; 3307 } else { 3308 pr_err("tmpfs: No value for mount option '%s'\n", 3309 this_char); 3310 goto error; 3311 } 3312 3313 if (!strcmp(this_char,"size")) { 3314 unsigned long long size; 3315 size = memparse(value,&rest); 3316 if (*rest == '%') { 3317 size <<= PAGE_SHIFT; 3318 size *= totalram_pages; 3319 do_div(size, 100); 3320 rest++; 3321 } 3322 if (*rest) 3323 goto bad_val; 3324 sbinfo->max_blocks = 3325 DIV_ROUND_UP(size, PAGE_SIZE); 3326 } else if (!strcmp(this_char,"nr_blocks")) { 3327 sbinfo->max_blocks = memparse(value, &rest); 3328 if (*rest) 3329 goto bad_val; 3330 } else if (!strcmp(this_char,"nr_inodes")) { 3331 sbinfo->max_inodes = memparse(value, &rest); 3332 if (*rest) 3333 goto bad_val; 3334 } else if (!strcmp(this_char,"mode")) { 3335 if (remount) 3336 continue; 3337 sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; 3338 if (*rest) 3339 goto bad_val; 3340 } else if (!strcmp(this_char,"uid")) { 3341 if (remount) 3342 continue; 3343 uid = simple_strtoul(value, &rest, 0); 3344 if (*rest) 3345 goto bad_val; 3346 sbinfo->uid = make_kuid(current_user_ns(), uid); 3347 if (!uid_valid(sbinfo->uid)) 3348 goto bad_val; 3349 } else if (!strcmp(this_char,"gid")) { 3350 if (remount) 3351 continue; 3352 gid = simple_strtoul(value, &rest, 0); 3353 if (*rest) 3354 goto bad_val; 3355 sbinfo->gid = make_kgid(current_user_ns(), gid); 3356 if (!gid_valid(sbinfo->gid)) 3357 goto bad_val; 3358 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3359 } else if (!strcmp(this_char, "huge")) { 3360 int huge; 3361 huge = shmem_parse_huge(value); 3362 if (huge < 0) 3363 goto bad_val; 3364 if (!has_transparent_hugepage() && 3365 huge != SHMEM_HUGE_NEVER) 3366 goto bad_val; 3367 sbinfo->huge = huge; 3368 #endif 3369 #ifdef CONFIG_NUMA 3370 } else if (!strcmp(this_char,"mpol")) { 3371 mpol_put(mpol); 3372 mpol = NULL; 3373 if (mpol_parse_str(value, &mpol)) 3374 goto bad_val; 3375 #endif 3376 } else { 3377 pr_err("tmpfs: Bad mount option %s\n", this_char); 3378 goto error; 3379 } 3380 } 3381 sbinfo->mpol = mpol; 3382 return 0; 3383 3384 bad_val: 3385 pr_err("tmpfs: Bad value '%s' for mount option '%s'\n", 3386 value, this_char); 3387 error: 3388 mpol_put(mpol); 3389 return 1; 3390 3391 } 3392 3393 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 3394 { 3395 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 3396 struct shmem_sb_info config = *sbinfo; 3397 unsigned long inodes; 3398 int error = -EINVAL; 3399 3400 config.mpol = NULL; 3401 if (shmem_parse_options(data, &config, true)) 3402 return error; 3403 3404 spin_lock(&sbinfo->stat_lock); 3405 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 3406 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) 3407 goto out; 3408 if (config.max_inodes < inodes) 3409 goto out; 3410 /* 3411 * Those tests disallow limited->unlimited while any are in use; 3412 * but we must separately disallow unlimited->limited, because 3413 * in that case we have no record of how much is already in use. 
3414 */ 3415 if (config.max_blocks && !sbinfo->max_blocks) 3416 goto out; 3417 if (config.max_inodes && !sbinfo->max_inodes) 3418 goto out; 3419 3420 error = 0; 3421 sbinfo->huge = config.huge; 3422 sbinfo->max_blocks = config.max_blocks; 3423 sbinfo->max_inodes = config.max_inodes; 3424 sbinfo->free_inodes = config.max_inodes - inodes; 3425 3426 /* 3427 * Preserve previous mempolicy unless mpol remount option was specified. 3428 */ 3429 if (config.mpol) { 3430 mpol_put(sbinfo->mpol); 3431 sbinfo->mpol = config.mpol; /* transfers initial ref */ 3432 } 3433 out: 3434 spin_unlock(&sbinfo->stat_lock); 3435 return error; 3436 } 3437 3438 static int shmem_show_options(struct seq_file *seq, struct dentry *root) 3439 { 3440 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 3441 3442 if (sbinfo->max_blocks != shmem_default_max_blocks()) 3443 seq_printf(seq, ",size=%luk", 3444 sbinfo->max_blocks << (PAGE_SHIFT - 10)); 3445 if (sbinfo->max_inodes != shmem_default_max_inodes()) 3446 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 3447 if (sbinfo->mode != (0777 | S_ISVTX)) 3448 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 3449 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 3450 seq_printf(seq, ",uid=%u", 3451 from_kuid_munged(&init_user_ns, sbinfo->uid)); 3452 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 3453 seq_printf(seq, ",gid=%u", 3454 from_kgid_munged(&init_user_ns, sbinfo->gid)); 3455 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3456 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ 3457 if (sbinfo->huge) 3458 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); 3459 #endif 3460 shmem_show_mpol(seq, sbinfo->mpol); 3461 return 0; 3462 } 3463 3464 #endif /* CONFIG_TMPFS */ 3465 3466 static void shmem_put_super(struct super_block *sb) 3467 { 3468 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 3469 3470 percpu_counter_destroy(&sbinfo->used_blocks); 3471 mpol_put(sbinfo->mpol); 3472 kfree(sbinfo); 3473 sb->s_fs_info = NULL; 3474 } 3475 3476 int shmem_fill_super(struct super_block *sb, void *data, int silent) 3477 { 3478 struct inode *inode; 3479 struct shmem_sb_info *sbinfo; 3480 int err = -ENOMEM; 3481 3482 /* Round up to L1_CACHE_BYTES to resist false sharing */ 3483 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 3484 L1_CACHE_BYTES), GFP_KERNEL); 3485 if (!sbinfo) 3486 return -ENOMEM; 3487 3488 sbinfo->mode = 0777 | S_ISVTX; 3489 sbinfo->uid = current_fsuid(); 3490 sbinfo->gid = current_fsgid(); 3491 sb->s_fs_info = sbinfo; 3492 3493 #ifdef CONFIG_TMPFS 3494 /* 3495 * Per default we only allow half of the physical ram per 3496 * tmpfs instance, limiting inodes to one per page of lowmem; 3497 * but the internal instance is left unlimited. 
3498 */ 3499 if (!(sb->s_flags & SB_KERNMOUNT)) { 3500 sbinfo->max_blocks = shmem_default_max_blocks(); 3501 sbinfo->max_inodes = shmem_default_max_inodes(); 3502 if (shmem_parse_options(data, sbinfo, false)) { 3503 err = -EINVAL; 3504 goto failed; 3505 } 3506 } else { 3507 sb->s_flags |= SB_NOUSER; 3508 } 3509 sb->s_export_op = &shmem_export_ops; 3510 sb->s_flags |= SB_NOSEC; 3511 #else 3512 sb->s_flags |= SB_NOUSER; 3513 #endif 3514 3515 spin_lock_init(&sbinfo->stat_lock); 3516 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 3517 goto failed; 3518 sbinfo->free_inodes = sbinfo->max_inodes; 3519 spin_lock_init(&sbinfo->shrinklist_lock); 3520 INIT_LIST_HEAD(&sbinfo->shrinklist); 3521 3522 sb->s_maxbytes = MAX_LFS_FILESIZE; 3523 sb->s_blocksize = PAGE_SIZE; 3524 sb->s_blocksize_bits = PAGE_SHIFT; 3525 sb->s_magic = TMPFS_MAGIC; 3526 sb->s_op = &shmem_ops; 3527 sb->s_time_gran = 1; 3528 #ifdef CONFIG_TMPFS_XATTR 3529 sb->s_xattr = shmem_xattr_handlers; 3530 #endif 3531 #ifdef CONFIG_TMPFS_POSIX_ACL 3532 sb->s_flags |= SB_POSIXACL; 3533 #endif 3534 uuid_gen(&sb->s_uuid); 3535 3536 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 3537 if (!inode) 3538 goto failed; 3539 inode->i_uid = sbinfo->uid; 3540 inode->i_gid = sbinfo->gid; 3541 sb->s_root = d_make_root(inode); 3542 if (!sb->s_root) 3543 goto failed; 3544 return 0; 3545 3546 failed: 3547 shmem_put_super(sb); 3548 return err; 3549 } 3550 3551 static struct kmem_cache *shmem_inode_cachep; 3552 3553 static struct inode *shmem_alloc_inode(struct super_block *sb) 3554 { 3555 struct shmem_inode_info *info; 3556 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 3557 if (!info) 3558 return NULL; 3559 return &info->vfs_inode; 3560 } 3561 3562 static void shmem_destroy_callback(struct rcu_head *head) 3563 { 3564 struct inode *inode = container_of(head, struct inode, i_rcu); 3565 if (S_ISLNK(inode->i_mode)) 3566 kfree(inode->i_link); 3567 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 3568 } 3569 3570 static void shmem_destroy_inode(struct inode *inode) 3571 { 3572 if (S_ISREG(inode->i_mode)) 3573 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 3574 call_rcu(&inode->i_rcu, shmem_destroy_callback); 3575 } 3576 3577 static void shmem_init_inode(void *foo) 3578 { 3579 struct shmem_inode_info *info = foo; 3580 inode_init_once(&info->vfs_inode); 3581 } 3582 3583 static void shmem_init_inodecache(void) 3584 { 3585 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 3586 sizeof(struct shmem_inode_info), 3587 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); 3588 } 3589 3590 static void shmem_destroy_inodecache(void) 3591 { 3592 kmem_cache_destroy(shmem_inode_cachep); 3593 } 3594 3595 static const struct address_space_operations shmem_aops = { 3596 .writepage = shmem_writepage, 3597 .set_page_dirty = __set_page_dirty_no_writeback, 3598 #ifdef CONFIG_TMPFS 3599 .write_begin = shmem_write_begin, 3600 .write_end = shmem_write_end, 3601 #endif 3602 #ifdef CONFIG_MIGRATION 3603 .migratepage = migrate_page, 3604 #endif 3605 .error_remove_page = generic_error_remove_page, 3606 }; 3607 3608 static const struct file_operations shmem_file_operations = { 3609 .mmap = shmem_mmap, 3610 .get_unmapped_area = shmem_get_unmapped_area, 3611 #ifdef CONFIG_TMPFS 3612 .llseek = shmem_file_llseek, 3613 .read_iter = shmem_file_read_iter, 3614 .write_iter = generic_file_write_iter, 3615 .fsync = noop_fsync, 3616 .splice_read = generic_file_splice_read, 3617 .splice_write = iter_file_splice_write, 3618 .fallocate = 
shmem_fallocate, 3619 #endif 3620 }; 3621 3622 static const struct inode_operations shmem_inode_operations = { 3623 .getattr = shmem_getattr, 3624 .setattr = shmem_setattr, 3625 #ifdef CONFIG_TMPFS_XATTR 3626 .listxattr = shmem_listxattr, 3627 .set_acl = simple_set_acl, 3628 #endif 3629 }; 3630 3631 static const struct inode_operations shmem_dir_inode_operations = { 3632 #ifdef CONFIG_TMPFS 3633 .create = shmem_create, 3634 .lookup = simple_lookup, 3635 .link = shmem_link, 3636 .unlink = shmem_unlink, 3637 .symlink = shmem_symlink, 3638 .mkdir = shmem_mkdir, 3639 .rmdir = shmem_rmdir, 3640 .mknod = shmem_mknod, 3641 .rename = shmem_rename2, 3642 .tmpfile = shmem_tmpfile, 3643 #endif 3644 #ifdef CONFIG_TMPFS_XATTR 3645 .listxattr = shmem_listxattr, 3646 #endif 3647 #ifdef CONFIG_TMPFS_POSIX_ACL 3648 .setattr = shmem_setattr, 3649 .set_acl = simple_set_acl, 3650 #endif 3651 }; 3652 3653 static const struct inode_operations shmem_special_inode_operations = { 3654 #ifdef CONFIG_TMPFS_XATTR 3655 .listxattr = shmem_listxattr, 3656 #endif 3657 #ifdef CONFIG_TMPFS_POSIX_ACL 3658 .setattr = shmem_setattr, 3659 .set_acl = simple_set_acl, 3660 #endif 3661 }; 3662 3663 static const struct super_operations shmem_ops = { 3664 .alloc_inode = shmem_alloc_inode, 3665 .destroy_inode = shmem_destroy_inode, 3666 #ifdef CONFIG_TMPFS 3667 .statfs = shmem_statfs, 3668 .remount_fs = shmem_remount_fs, 3669 .show_options = shmem_show_options, 3670 #endif 3671 .evict_inode = shmem_evict_inode, 3672 .drop_inode = generic_delete_inode, 3673 .put_super = shmem_put_super, 3674 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3675 .nr_cached_objects = shmem_unused_huge_count, 3676 .free_cached_objects = shmem_unused_huge_scan, 3677 #endif 3678 }; 3679 3680 static const struct vm_operations_struct shmem_vm_ops = { 3681 .fault = shmem_fault, 3682 .map_pages = filemap_map_pages, 3683 #ifdef CONFIG_NUMA 3684 .set_policy = shmem_set_policy, 3685 .get_policy = shmem_get_policy, 3686 #endif 3687 }; 3688 3689 static struct dentry *shmem_mount(struct file_system_type *fs_type, 3690 int flags, const char *dev_name, void *data) 3691 { 3692 return mount_nodev(fs_type, flags, data, shmem_fill_super); 3693 } 3694 3695 static struct file_system_type shmem_fs_type = { 3696 .owner = THIS_MODULE, 3697 .name = "tmpfs", 3698 .mount = shmem_mount, 3699 .kill_sb = kill_litter_super, 3700 .fs_flags = FS_USERNS_MOUNT, 3701 }; 3702 3703 int __init shmem_init(void) 3704 { 3705 int error; 3706 3707 /* If rootfs called this, don't re-init */ 3708 if (shmem_inode_cachep) 3709 return 0; 3710 3711 shmem_init_inodecache(); 3712 3713 error = register_filesystem(&shmem_fs_type); 3714 if (error) { 3715 pr_err("Could not register tmpfs\n"); 3716 goto out2; 3717 } 3718 3719 shm_mnt = kern_mount(&shmem_fs_type); 3720 if (IS_ERR(shm_mnt)) { 3721 error = PTR_ERR(shm_mnt); 3722 pr_err("Could not kern_mount tmpfs\n"); 3723 goto out1; 3724 } 3725 3726 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3727 if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) 3728 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; 3729 else 3730 shmem_huge = 0; /* just in case it was patched */ 3731 #endif 3732 return 0; 3733 3734 out1: 3735 unregister_filesystem(&shmem_fs_type); 3736 out2: 3737 shmem_destroy_inodecache(); 3738 shm_mnt = ERR_PTR(error); 3739 return error; 3740 } 3741 3742 #if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) 3743 static ssize_t shmem_enabled_show(struct kobject *kobj, 3744 struct kobj_attribute *attr, char *buf) 3745 { 3746 int values[] = { 
3747 SHMEM_HUGE_ALWAYS, 3748 SHMEM_HUGE_WITHIN_SIZE, 3749 SHMEM_HUGE_ADVISE, 3750 SHMEM_HUGE_NEVER, 3751 SHMEM_HUGE_DENY, 3752 SHMEM_HUGE_FORCE, 3753 }; 3754 int i, count; 3755 3756 for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) { 3757 const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s "; 3758 3759 count += sprintf(buf + count, fmt, 3760 shmem_format_huge(values[i])); 3761 } 3762 buf[count - 1] = '\n'; 3763 return count; 3764 } 3765 3766 static ssize_t shmem_enabled_store(struct kobject *kobj, 3767 struct kobj_attribute *attr, const char *buf, size_t count) 3768 { 3769 char tmp[16]; 3770 int huge; 3771 3772 if (count + 1 > sizeof(tmp)) 3773 return -EINVAL; 3774 memcpy(tmp, buf, count); 3775 tmp[count] = '\0'; 3776 if (count && tmp[count - 1] == '\n') 3777 tmp[count - 1] = '\0'; 3778 3779 huge = shmem_parse_huge(tmp); 3780 if (huge == -EINVAL) 3781 return -EINVAL; 3782 if (!has_transparent_hugepage() && 3783 huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) 3784 return -EINVAL; 3785 3786 shmem_huge = huge; 3787 if (shmem_huge > SHMEM_HUGE_DENY) 3788 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; 3789 return count; 3790 } 3791 3792 struct kobj_attribute shmem_enabled_attr = 3793 __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); 3794 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ 3795 3796 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3797 bool shmem_huge_enabled(struct vm_area_struct *vma) 3798 { 3799 struct inode *inode = file_inode(vma->vm_file); 3800 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 3801 loff_t i_size; 3802 pgoff_t off; 3803 3804 if (shmem_huge == SHMEM_HUGE_FORCE) 3805 return true; 3806 if (shmem_huge == SHMEM_HUGE_DENY) 3807 return false; 3808 switch (sbinfo->huge) { 3809 case SHMEM_HUGE_NEVER: 3810 return false; 3811 case SHMEM_HUGE_ALWAYS: 3812 return true; 3813 case SHMEM_HUGE_WITHIN_SIZE: 3814 off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); 3815 i_size = round_up(i_size_read(inode), PAGE_SIZE); 3816 if (i_size >= HPAGE_PMD_SIZE && 3817 i_size >> PAGE_SHIFT >= off) 3818 return true; 3819 /* fall through */ 3820 case SHMEM_HUGE_ADVISE: 3821 /* TODO: implement fadvise() hints */ 3822 return (vma->vm_flags & VM_HUGEPAGE); 3823 default: 3824 VM_BUG_ON(1); 3825 return false; 3826 } 3827 } 3828 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ 3829 3830 #else /* !CONFIG_SHMEM */ 3831 3832 /* 3833 * tiny-shmem: simple shmemfs and tmpfs using ramfs code 3834 * 3835 * This is intended for small system where the benefits of the full 3836 * shmem code (swap-backed and resource-limited) are outweighed by 3837 * their complexity. On systems without swap this code should be 3838 * effectively equivalent, but much lighter weight. 
3839 */ 3840 3841 static struct file_system_type shmem_fs_type = { 3842 .name = "tmpfs", 3843 .mount = ramfs_mount, 3844 .kill_sb = kill_litter_super, 3845 .fs_flags = FS_USERNS_MOUNT, 3846 }; 3847 3848 int __init shmem_init(void) 3849 { 3850 BUG_ON(register_filesystem(&shmem_fs_type) != 0); 3851 3852 shm_mnt = kern_mount(&shmem_fs_type); 3853 BUG_ON(IS_ERR(shm_mnt)); 3854 3855 return 0; 3856 } 3857 3858 int shmem_unuse(swp_entry_t swap, struct page *page) 3859 { 3860 return 0; 3861 } 3862 3863 int shmem_lock(struct file *file, int lock, struct user_struct *user) 3864 { 3865 return 0; 3866 } 3867 3868 void shmem_unlock_mapping(struct address_space *mapping) 3869 { 3870 } 3871 3872 #ifdef CONFIG_MMU 3873 unsigned long shmem_get_unmapped_area(struct file *file, 3874 unsigned long addr, unsigned long len, 3875 unsigned long pgoff, unsigned long flags) 3876 { 3877 return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); 3878 } 3879 #endif 3880 3881 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 3882 { 3883 truncate_inode_pages_range(inode->i_mapping, lstart, lend); 3884 } 3885 EXPORT_SYMBOL_GPL(shmem_truncate_range); 3886 3887 #define shmem_vm_ops generic_file_vm_ops 3888 #define shmem_file_operations ramfs_file_operations 3889 #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 3890 #define shmem_acct_size(flags, size) 0 3891 #define shmem_unacct_size(flags, size) do {} while (0) 3892 3893 #endif /* CONFIG_SHMEM */ 3894 3895 /* common code */ 3896 3897 static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, 3898 unsigned long flags, unsigned int i_flags) 3899 { 3900 struct inode *inode; 3901 struct file *res; 3902 3903 if (IS_ERR(mnt)) 3904 return ERR_CAST(mnt); 3905 3906 if (size < 0 || size > MAX_LFS_FILESIZE) 3907 return ERR_PTR(-EINVAL); 3908 3909 if (shmem_acct_size(flags, size)) 3910 return ERR_PTR(-ENOMEM); 3911 3912 inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, 3913 flags); 3914 if (unlikely(!inode)) { 3915 shmem_unacct_size(flags, size); 3916 return ERR_PTR(-ENOSPC); 3917 } 3918 inode->i_flags |= i_flags; 3919 inode->i_size = size; 3920 clear_nlink(inode); /* It is unlinked */ 3921 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); 3922 if (!IS_ERR(res)) 3923 res = alloc_file_pseudo(inode, mnt, name, O_RDWR, 3924 &shmem_file_operations); 3925 if (IS_ERR(res)) 3926 iput(inode); 3927 return res; 3928 } 3929 3930 /** 3931 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be 3932 * kernel internal. There will be NO LSM permission checks against the 3933 * underlying inode. So users of this interface must do LSM checks at a 3934 * higher layer. The users are the big_key and shm implementations. LSM 3935 * checks are provided at the key or shm level rather than the inode. 
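 * (Implemented as __shmem_file_setup() on the kernel-internal shm_mnt
 * mount with the S_PRIVATE inode flag set, which is what makes the
 * security modules skip this inode.)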
* @name: name for dentry (to be seen in /proc/<pid>/maps) 3937 * @size: size to be set for the file 3938 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3939 */ 3940 struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) 3941 { 3942 return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); 3943 } 3944 3945 /** 3946 * shmem_file_setup - get an unlinked file living in tmpfs 3947 * @name: name for dentry (to be seen in /proc/<pid>/maps) 3948 * @size: size to be set for the file 3949 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3950 */ 3951 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) 3952 { 3953 return __shmem_file_setup(shm_mnt, name, size, flags, 0); 3954 } 3955 EXPORT_SYMBOL_GPL(shmem_file_setup); 3956 3957 /** 3958 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs 3959 * @mnt: the tmpfs mount where the file will be created 3960 * @name: name for dentry (to be seen in /proc/<pid>/maps) 3961 * @size: size to be set for the file 3962 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3963 */ 3964 struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, 3965 loff_t size, unsigned long flags) 3966 { 3967 return __shmem_file_setup(mnt, name, size, flags, 0); 3968 } 3969 EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); 3970 3971 /** 3972 * shmem_zero_setup - setup a shared anonymous mapping 3973 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff 3974 */ 3975 int shmem_zero_setup(struct vm_area_struct *vma) 3976 { 3977 struct file *file; 3978 loff_t size = vma->vm_end - vma->vm_start; 3979 3980 /* 3981 * Cloning a new file under mmap_sem leads to a lock ordering conflict 3982 * between XFS directory reading and selinux: since this file is only 3983 * accessible to the user through its mapping, use S_PRIVATE flag to 3984 * bypass file security, in the same way as shmem_kernel_file_setup(). 3985 */ 3986 file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); 3987 if (IS_ERR(file)) 3988 return PTR_ERR(file); 3989 3990 if (vma->vm_file) 3991 fput(vma->vm_file); 3992 vma->vm_file = file; 3993 vma->vm_ops = &shmem_vm_ops; 3994 3995 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && 3996 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < 3997 (vma->vm_end & HPAGE_PMD_MASK)) { 3998 khugepaged_enter(vma, vma->vm_flags); 3999 } 4000 4001 return 0; 4002 } 4003 4004 /** 4005 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. 4006 * @mapping: the page's address_space 4007 * @index: the page index 4008 * @gfp: the page allocator flags to use if allocating 4009 * 4010 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", 4011 * with any new page allocations done using the specified allocation flags. 4012 * But read_cache_page_gfp() uses the ->readpage() method: which does not 4013 * suit tmpfs, since it may have pages in swapcache, and needs to find those 4014 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 4015 * 4016 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in 4017 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4018 */ 4019 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 4020 pgoff_t index, gfp_t gfp) 4021 { 4022 #ifdef CONFIG_SHMEM 4023 struct inode *inode = mapping->host; 4024 struct page *page; 4025 int error; 4026 4027 BUG_ON(mapping->a_ops != &shmem_aops); 4028 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, 4029 gfp, NULL, NULL, NULL); 4030 if (error) 4031 page = ERR_PTR(error); 4032 else 4033 unlock_page(page); 4034 return page; 4035 #else 4036 /* 4037 * The tiny !SHMEM case uses ramfs without swap 4038 */ 4039 return read_cache_page_gfp(mapping, index, gfp); 4040 #endif 4041 } 4042 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 4043
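/*
 * Usage sketch (illustrative only, not part of this file): a kernel user
 * of the helpers exported above might back a private object with tmpfs
 * roughly like this; "my-object" and obj_size are placeholder names.
 *
 *	struct file *filp;
 *	struct page *page;
 *
 *	filp = shmem_file_setup("my-object", obj_size, 0);
 *	if (IS_ERR(filp))
 *		return PTR_ERR(filp);
 *
 *	page = shmem_read_mapping_page_gfp(filp->f_mapping, 0,
 *				mapping_gfp_mask(filp->f_mapping));
 *	if (IS_ERR(page)) {
 *		fput(filp);
 *		return PTR_ERR(page);
 *	}
 *	... use the page, set_page_dirty() if it was written, then ...
 *	put_page(page);
 *	shmem_truncate_range(file_inode(filp), 0, (loff_t)-1);
 *	fput(filp);
 */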