/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>

#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>

#include <linux/uaccess.h>
#include <asm/pgtable.h>

#include "internal.h"

#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
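/*
 * Worked example of the accounting macros above: with 4KB pages,
 * VM_ACCT(5000) = PAGE_ALIGN(5000) >> PAGE_SHIFT = 2, so a 5000-byte
 * object is charged as two pages; and the directory operations later in
 * this file adjust a parent directory's i_size in steps of the fictitious
 * BOGO_DIRENT_SIZE of 20 bytes.
 */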

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_mutex making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
}
#endif

static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp,
		gfp_t gfp, struct vm_area_struct *vma,
		struct vm_fault *vmf, vm_fault_t *fault_type);

int shmem_getpage(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp)
{
	return shmem_getpage_gfp(inode, index, pagep, sgp,
		mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
}

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & VM_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
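/*
 * Note the asymmetry that follows: shmem_acct_block() only charges when
 * VM_NORESERVE is set (the per-page tmpfs case), because objects without
 * VM_NORESERVE were already charged in full by shmem_acct_size() above.
 * For example, a sparse 1GB tmpfs file costs nothing at ftruncate() time;
 * each page is accounted only when it is actually instantiated.
 */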
static inline int shmem_acct_block(unsigned long flags, long pages)
{
	if (!(flags & VM_NORESERVE))
		return 0;

	return security_vm_enough_memory_mm(current->mm,
			pages * VM_ACCT(PAGE_SIZE));
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (shmem_acct_block(info->flags, pages))
		return false;

	if (sbinfo->max_blocks) {
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   sbinfo->max_blocks - pages) > 0)
			goto unacct;
		percpu_counter_add(&sbinfo->used_blocks, pages);
	}

	return true;

unacct:
	shmem_unacct_blocks(info->flags, pages);
	return false;
}

static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks, pages);
	shmem_unacct_blocks(info->flags, pages);
}

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static struct file_system_type shmem_fs_type;

bool vma_is_shmem(struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

static int shmem_reserve_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return -ENOSPC;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}
	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
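/*
 * Worked example: if info->alloced is 8 while the page cache holds 5
 * pages and 1 page sits in swap, the mm has dropped 8 - (5 + 1) = 2
 * undirtied hole pages behind our back, and shmem_recalc_inode() below
 * returns those 2 pages to the block accounting.
 */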
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		info->alloced -= freed;
		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
		shmem_inode_unacct_blocks(inode, freed);
	}
}

bool shmem_charge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	if (!shmem_inode_acct_block(inode, pages))
		return false;

	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
	inode->i_mapping->nrpages += pages;

	spin_lock_irqsave(&info->lock, flags);
	info->alloced += pages;
	inode->i_blocks += pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);

	return true;
}

void shmem_uncharge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	/* nrpages adjustment done by __delete_from_page_cache() or caller */

	spin_lock_irqsave(&info->lock, flags);
	info->alloced -= pages;
	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);

	shmem_inode_unacct_blocks(inode, pages);
}

/*
 * Replace item expected in xarray by a new item, while holding xa_lock.
 */
static int shmem_replace_entry(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *item;

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);
	item = xas_load(&xas);
	if (item != expected)
		return -ENOENT;
	xas_store(&xas, replacement);
	return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with fadvise()/madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3
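/*
 * For example, "mount -t tmpfs -o huge=within_size tmpfs /mnt" selects
 * SHMEM_HUGE_WITHIN_SIZE for that mount; shmem_parse_huge() below maps
 * the option strings ("never", "always", "within_size", "advise", and
 * the special "deny"/"force") to these values.
 */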

/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)

#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
/* ifdef here to avoid bloating shmem.o when not necessary */

static int shmem_huge __read_mostly;

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -EINVAL;
}

static const char *shmem_format_huge(int huge)
{
	switch (huge) {
	case SHMEM_HUGE_NEVER:
		return "never";
	case SHMEM_HUGE_ALWAYS:
		return "always";
	case SHMEM_HUGE_WITHIN_SIZE:
		return "within_size";
	case SHMEM_HUGE_ADVISE:
		return "advise";
	case SHMEM_HUGE_DENY:
		return "deny";
	case SHMEM_HUGE_FORCE:
		return "force";
	default:
		VM_BUG_ON(1);
		return "bad_val";
	}
}
#endif

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	LIST_HEAD(list), *pos, *next;
	LIST_HEAD(to_remove);
	struct inode *inode;
	struct shmem_inode_info *info;
	struct page *page;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	int removed = 0, split = 0;

	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			removed++;
			goto next;
		}

		/* Check if there's anything to gain */
		if (round_up(inode->i_size, PAGE_SIZE) ==
				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
			list_move(&info->shrinklist, &to_remove);
			removed++;
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	list_for_each_safe(pos, next, &to_remove) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;
		list_del_init(&info->shrinklist);
		iput(inode);
	}

	list_for_each_safe(pos, next, &list) {
		int ret;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		if (nr_to_split && split >= nr_to_split)
			goto leave;

		page = find_get_page(inode->i_mapping,
				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
		if (!page)
			goto drop;

		/* No huge page at the end of the file: nothing to split */
		if (!PageTransHuge(page)) {
			put_page(page);
			goto drop;
		}

		/*
		 * Leave the inode on the list if we failed to lock
		 * the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
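		 *
		 * (A trylock failure below simply defers this inode to a
		 * later pass via the "leave" label; nothing is retried
		 * synchronously here.)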
512 */ 513 if (!trylock_page(page)) { 514 put_page(page); 515 goto leave; 516 } 517 518 ret = split_huge_page(page); 519 unlock_page(page); 520 put_page(page); 521 522 /* If split failed leave the inode on the list */ 523 if (ret) 524 goto leave; 525 526 split++; 527 drop: 528 list_del_init(&info->shrinklist); 529 removed++; 530 leave: 531 iput(inode); 532 } 533 534 spin_lock(&sbinfo->shrinklist_lock); 535 list_splice_tail(&list, &sbinfo->shrinklist); 536 sbinfo->shrinklist_len -= removed; 537 spin_unlock(&sbinfo->shrinklist_lock); 538 539 return split; 540 } 541 542 static long shmem_unused_huge_scan(struct super_block *sb, 543 struct shrink_control *sc) 544 { 545 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 546 547 if (!READ_ONCE(sbinfo->shrinklist_len)) 548 return SHRINK_STOP; 549 550 return shmem_unused_huge_shrink(sbinfo, sc, 0); 551 } 552 553 static long shmem_unused_huge_count(struct super_block *sb, 554 struct shrink_control *sc) 555 { 556 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 557 return READ_ONCE(sbinfo->shrinklist_len); 558 } 559 #else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */ 560 561 #define shmem_huge SHMEM_HUGE_DENY 562 563 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, 564 struct shrink_control *sc, unsigned long nr_to_split) 565 { 566 return 0; 567 } 568 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ 569 570 static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) 571 { 572 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && 573 (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && 574 shmem_huge != SHMEM_HUGE_DENY) 575 return true; 576 return false; 577 } 578 579 /* 580 * Like add_to_page_cache_locked, but error if expected item has gone. 581 */ 582 static int shmem_add_to_page_cache(struct page *page, 583 struct address_space *mapping, 584 pgoff_t index, void *expected, gfp_t gfp) 585 { 586 XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); 587 unsigned long i = 0; 588 unsigned long nr = 1UL << compound_order(page); 589 590 VM_BUG_ON_PAGE(PageTail(page), page); 591 VM_BUG_ON_PAGE(index != round_down(index, nr), page); 592 VM_BUG_ON_PAGE(!PageLocked(page), page); 593 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 594 VM_BUG_ON(expected && PageTransHuge(page)); 595 596 page_ref_add(page, nr); 597 page->mapping = mapping; 598 page->index = index; 599 600 do { 601 void *entry; 602 xas_lock_irq(&xas); 603 entry = xas_find_conflict(&xas); 604 if (entry != expected) 605 xas_set_err(&xas, -EEXIST); 606 xas_create_range(&xas); 607 if (xas_error(&xas)) 608 goto unlock; 609 next: 610 xas_store(&xas, page + i); 611 if (++i < nr) { 612 xas_next(&xas); 613 goto next; 614 } 615 if (PageTransHuge(page)) { 616 count_vm_event(THP_FILE_ALLOC); 617 __inc_node_page_state(page, NR_SHMEM_THPS); 618 } 619 mapping->nrpages += nr; 620 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); 621 __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); 622 unlock: 623 xas_unlock_irq(&xas); 624 } while (xas_nomem(&xas, gfp)); 625 626 if (xas_error(&xas)) { 627 page->mapping = NULL; 628 page_ref_sub(page, nr); 629 return xas_error(&xas); 630 } 631 632 return 0; 633 } 634 635 /* 636 * Like delete_from_page_cache, but substitutes swap for page. 
637 */ 638 static void shmem_delete_from_page_cache(struct page *page, void *radswap) 639 { 640 struct address_space *mapping = page->mapping; 641 int error; 642 643 VM_BUG_ON_PAGE(PageCompound(page), page); 644 645 xa_lock_irq(&mapping->i_pages); 646 error = shmem_replace_entry(mapping, page->index, page, radswap); 647 page->mapping = NULL; 648 mapping->nrpages--; 649 __dec_node_page_state(page, NR_FILE_PAGES); 650 __dec_node_page_state(page, NR_SHMEM); 651 xa_unlock_irq(&mapping->i_pages); 652 put_page(page); 653 BUG_ON(error); 654 } 655 656 /* 657 * Remove swap entry from page cache, free the swap and its page cache. 658 */ 659 static int shmem_free_swap(struct address_space *mapping, 660 pgoff_t index, void *radswap) 661 { 662 void *old; 663 664 old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); 665 if (old != radswap) 666 return -ENOENT; 667 free_swap_and_cache(radix_to_swp_entry(radswap)); 668 return 0; 669 } 670 671 /* 672 * Determine (in bytes) how many of the shmem object's pages mapped by the 673 * given offsets are swapped out. 674 * 675 * This is safe to call without i_mutex or the i_pages lock thanks to RCU, 676 * as long as the inode doesn't go away and racy results are not a problem. 677 */ 678 unsigned long shmem_partial_swap_usage(struct address_space *mapping, 679 pgoff_t start, pgoff_t end) 680 { 681 XA_STATE(xas, &mapping->i_pages, start); 682 struct page *page; 683 unsigned long swapped = 0; 684 685 rcu_read_lock(); 686 xas_for_each(&xas, page, end - 1) { 687 if (xas_retry(&xas, page)) 688 continue; 689 if (xa_is_value(page)) 690 swapped++; 691 692 if (need_resched()) { 693 xas_pause(&xas); 694 cond_resched_rcu(); 695 } 696 } 697 698 rcu_read_unlock(); 699 700 return swapped << PAGE_SHIFT; 701 } 702 703 /* 704 * Determine (in bytes) how many of the shmem object's pages mapped by the 705 * given vma is swapped out. 706 * 707 * This is safe to call without i_mutex or the i_pages lock thanks to RCU, 708 * as long as the inode doesn't go away and racy results are not a problem. 709 */ 710 unsigned long shmem_swap_usage(struct vm_area_struct *vma) 711 { 712 struct inode *inode = file_inode(vma->vm_file); 713 struct shmem_inode_info *info = SHMEM_I(inode); 714 struct address_space *mapping = inode->i_mapping; 715 unsigned long swapped; 716 717 /* Be careful as we don't hold info->lock */ 718 swapped = READ_ONCE(info->swapped); 719 720 /* 721 * The easier cases are when the shmem object has nothing in swap, or 722 * the vma maps it whole. Then we can simply use the stats that we 723 * already track. 724 */ 725 if (!swapped) 726 return 0; 727 728 if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) 729 return swapped << PAGE_SHIFT; 730 731 /* Here comes the more involved part */ 732 return shmem_partial_swap_usage(mapping, 733 linear_page_index(vma, vma->vm_start), 734 linear_page_index(vma, vma->vm_end)); 735 } 736 737 /* 738 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. 739 */ 740 void shmem_unlock_mapping(struct address_space *mapping) 741 { 742 struct pagevec pvec; 743 pgoff_t indices[PAGEVEC_SIZE]; 744 pgoff_t index = 0; 745 746 pagevec_init(&pvec); 747 /* 748 * Minor point, but we might as well stop if someone else SHM_LOCKs it. 749 */ 750 while (!mapping_unevictable(mapping)) { 751 /* 752 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it 753 * has finished, if it hits a row of PAGEVEC_SIZE swap entries. 
754 */ 755 pvec.nr = find_get_entries(mapping, index, 756 PAGEVEC_SIZE, pvec.pages, indices); 757 if (!pvec.nr) 758 break; 759 index = indices[pvec.nr - 1] + 1; 760 pagevec_remove_exceptionals(&pvec); 761 check_move_unevictable_pages(&pvec); 762 pagevec_release(&pvec); 763 cond_resched(); 764 } 765 } 766 767 /* 768 * Remove range of pages and swap entries from page cache, and free them. 769 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 770 */ 771 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, 772 bool unfalloc) 773 { 774 struct address_space *mapping = inode->i_mapping; 775 struct shmem_inode_info *info = SHMEM_I(inode); 776 pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; 777 pgoff_t end = (lend + 1) >> PAGE_SHIFT; 778 unsigned int partial_start = lstart & (PAGE_SIZE - 1); 779 unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1); 780 struct pagevec pvec; 781 pgoff_t indices[PAGEVEC_SIZE]; 782 long nr_swaps_freed = 0; 783 pgoff_t index; 784 int i; 785 786 if (lend == -1) 787 end = -1; /* unsigned, so actually very big */ 788 789 pagevec_init(&pvec); 790 index = start; 791 while (index < end) { 792 pvec.nr = find_get_entries(mapping, index, 793 min(end - index, (pgoff_t)PAGEVEC_SIZE), 794 pvec.pages, indices); 795 if (!pvec.nr) 796 break; 797 for (i = 0; i < pagevec_count(&pvec); i++) { 798 struct page *page = pvec.pages[i]; 799 800 index = indices[i]; 801 if (index >= end) 802 break; 803 804 if (xa_is_value(page)) { 805 if (unfalloc) 806 continue; 807 nr_swaps_freed += !shmem_free_swap(mapping, 808 index, page); 809 continue; 810 } 811 812 VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page); 813 814 if (!trylock_page(page)) 815 continue; 816 817 if (PageTransTail(page)) { 818 /* Middle of THP: zero out the page */ 819 clear_highpage(page); 820 unlock_page(page); 821 continue; 822 } else if (PageTransHuge(page)) { 823 if (index == round_down(end, HPAGE_PMD_NR)) { 824 /* 825 * Range ends in the middle of THP: 826 * zero out the page 827 */ 828 clear_highpage(page); 829 unlock_page(page); 830 continue; 831 } 832 index += HPAGE_PMD_NR - 1; 833 i += HPAGE_PMD_NR - 1; 834 } 835 836 if (!unfalloc || !PageUptodate(page)) { 837 VM_BUG_ON_PAGE(PageTail(page), page); 838 if (page_mapping(page) == mapping) { 839 VM_BUG_ON_PAGE(PageWriteback(page), page); 840 truncate_inode_page(mapping, page); 841 } 842 } 843 unlock_page(page); 844 } 845 pagevec_remove_exceptionals(&pvec); 846 pagevec_release(&pvec); 847 cond_resched(); 848 index++; 849 } 850 851 if (partial_start) { 852 struct page *page = NULL; 853 shmem_getpage(inode, start - 1, &page, SGP_READ); 854 if (page) { 855 unsigned int top = PAGE_SIZE; 856 if (start > end) { 857 top = partial_end; 858 partial_end = 0; 859 } 860 zero_user_segment(page, partial_start, top); 861 set_page_dirty(page); 862 unlock_page(page); 863 put_page(page); 864 } 865 } 866 if (partial_end) { 867 struct page *page = NULL; 868 shmem_getpage(inode, end, &page, SGP_READ); 869 if (page) { 870 zero_user_segment(page, 0, partial_end); 871 set_page_dirty(page); 872 unlock_page(page); 873 put_page(page); 874 } 875 } 876 if (start >= end) 877 return; 878 879 index = start; 880 while (index < end) { 881 cond_resched(); 882 883 pvec.nr = find_get_entries(mapping, index, 884 min(end - index, (pgoff_t)PAGEVEC_SIZE), 885 pvec.pages, indices); 886 if (!pvec.nr) { 887 /* If all gone or hole-punch or unfalloc, we're done */ 888 if (index == start || end != -1) 889 break; 890 /* But if truncating, restart to make sure all 
gone */ 891 index = start; 892 continue; 893 } 894 for (i = 0; i < pagevec_count(&pvec); i++) { 895 struct page *page = pvec.pages[i]; 896 897 index = indices[i]; 898 if (index >= end) 899 break; 900 901 if (xa_is_value(page)) { 902 if (unfalloc) 903 continue; 904 if (shmem_free_swap(mapping, index, page)) { 905 /* Swap was replaced by page: retry */ 906 index--; 907 break; 908 } 909 nr_swaps_freed++; 910 continue; 911 } 912 913 lock_page(page); 914 915 if (PageTransTail(page)) { 916 /* Middle of THP: zero out the page */ 917 clear_highpage(page); 918 unlock_page(page); 919 /* 920 * Partial thp truncate due 'start' in middle 921 * of THP: don't need to look on these pages 922 * again on !pvec.nr restart. 923 */ 924 if (index != round_down(end, HPAGE_PMD_NR)) 925 start++; 926 continue; 927 } else if (PageTransHuge(page)) { 928 if (index == round_down(end, HPAGE_PMD_NR)) { 929 /* 930 * Range ends in the middle of THP: 931 * zero out the page 932 */ 933 clear_highpage(page); 934 unlock_page(page); 935 continue; 936 } 937 index += HPAGE_PMD_NR - 1; 938 i += HPAGE_PMD_NR - 1; 939 } 940 941 if (!unfalloc || !PageUptodate(page)) { 942 VM_BUG_ON_PAGE(PageTail(page), page); 943 if (page_mapping(page) == mapping) { 944 VM_BUG_ON_PAGE(PageWriteback(page), page); 945 truncate_inode_page(mapping, page); 946 } else { 947 /* Page was replaced by swap: retry */ 948 unlock_page(page); 949 index--; 950 break; 951 } 952 } 953 unlock_page(page); 954 } 955 pagevec_remove_exceptionals(&pvec); 956 pagevec_release(&pvec); 957 index++; 958 } 959 960 spin_lock_irq(&info->lock); 961 info->swapped -= nr_swaps_freed; 962 shmem_recalc_inode(inode); 963 spin_unlock_irq(&info->lock); 964 } 965 966 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 967 { 968 shmem_undo_range(inode, lstart, lend, false); 969 inode->i_ctime = inode->i_mtime = current_time(inode); 970 } 971 EXPORT_SYMBOL_GPL(shmem_truncate_range); 972 973 static int shmem_getattr(const struct path *path, struct kstat *stat, 974 u32 request_mask, unsigned int query_flags) 975 { 976 struct inode *inode = path->dentry->d_inode; 977 struct shmem_inode_info *info = SHMEM_I(inode); 978 struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); 979 980 if (info->alloced - info->swapped != inode->i_mapping->nrpages) { 981 spin_lock_irq(&info->lock); 982 shmem_recalc_inode(inode); 983 spin_unlock_irq(&info->lock); 984 } 985 generic_fillattr(inode, stat); 986 987 if (is_huge_enabled(sb_info)) 988 stat->blksize = HPAGE_PMD_SIZE; 989 990 return 0; 991 } 992 993 static int shmem_setattr(struct dentry *dentry, struct iattr *attr) 994 { 995 struct inode *inode = d_inode(dentry); 996 struct shmem_inode_info *info = SHMEM_I(inode); 997 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 998 int error; 999 1000 error = setattr_prepare(dentry, attr); 1001 if (error) 1002 return error; 1003 1004 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 1005 loff_t oldsize = inode->i_size; 1006 loff_t newsize = attr->ia_size; 1007 1008 /* protected by i_mutex */ 1009 if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || 1010 (newsize > oldsize && (info->seals & F_SEAL_GROW))) 1011 return -EPERM; 1012 1013 if (newsize != oldsize) { 1014 error = shmem_reacct_size(SHMEM_I(inode)->flags, 1015 oldsize, newsize); 1016 if (error) 1017 return error; 1018 i_size_write(inode, newsize); 1019 inode->i_ctime = inode->i_mtime = current_time(inode); 1020 } 1021 if (newsize <= oldsize) { 1022 loff_t holebegin = round_up(newsize, PAGE_SIZE); 1023 if (oldsize > 
holebegin) 1024 unmap_mapping_range(inode->i_mapping, 1025 holebegin, 0, 1); 1026 if (info->alloced) 1027 shmem_truncate_range(inode, 1028 newsize, (loff_t)-1); 1029 /* unmap again to remove racily COWed private pages */ 1030 if (oldsize > holebegin) 1031 unmap_mapping_range(inode->i_mapping, 1032 holebegin, 0, 1); 1033 1034 /* 1035 * Part of the huge page can be beyond i_size: subject 1036 * to shrink under memory pressure. 1037 */ 1038 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { 1039 spin_lock(&sbinfo->shrinklist_lock); 1040 /* 1041 * _careful to defend against unlocked access to 1042 * ->shrink_list in shmem_unused_huge_shrink() 1043 */ 1044 if (list_empty_careful(&info->shrinklist)) { 1045 list_add_tail(&info->shrinklist, 1046 &sbinfo->shrinklist); 1047 sbinfo->shrinklist_len++; 1048 } 1049 spin_unlock(&sbinfo->shrinklist_lock); 1050 } 1051 } 1052 } 1053 1054 setattr_copy(inode, attr); 1055 if (attr->ia_valid & ATTR_MODE) 1056 error = posix_acl_chmod(inode, inode->i_mode); 1057 return error; 1058 } 1059 1060 static void shmem_evict_inode(struct inode *inode) 1061 { 1062 struct shmem_inode_info *info = SHMEM_I(inode); 1063 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1064 1065 if (inode->i_mapping->a_ops == &shmem_aops) { 1066 shmem_unacct_size(info->flags, inode->i_size); 1067 inode->i_size = 0; 1068 shmem_truncate_range(inode, 0, (loff_t)-1); 1069 if (!list_empty(&info->shrinklist)) { 1070 spin_lock(&sbinfo->shrinklist_lock); 1071 if (!list_empty(&info->shrinklist)) { 1072 list_del_init(&info->shrinklist); 1073 sbinfo->shrinklist_len--; 1074 } 1075 spin_unlock(&sbinfo->shrinklist_lock); 1076 } 1077 if (!list_empty(&info->swaplist)) { 1078 mutex_lock(&shmem_swaplist_mutex); 1079 list_del_init(&info->swaplist); 1080 mutex_unlock(&shmem_swaplist_mutex); 1081 } 1082 } 1083 1084 simple_xattrs_free(&info->xattrs); 1085 WARN_ON(inode->i_blocks); 1086 shmem_free_inode(inode->i_sb); 1087 clear_inode(inode); 1088 } 1089 1090 static unsigned long find_swap_entry(struct xarray *xa, void *item) 1091 { 1092 XA_STATE(xas, xa, 0); 1093 unsigned int checked = 0; 1094 void *entry; 1095 1096 rcu_read_lock(); 1097 xas_for_each(&xas, entry, ULONG_MAX) { 1098 if (xas_retry(&xas, entry)) 1099 continue; 1100 if (entry == item) 1101 break; 1102 checked++; 1103 if ((checked % XA_CHECK_SCHED) != 0) 1104 continue; 1105 xas_pause(&xas); 1106 cond_resched_rcu(); 1107 } 1108 rcu_read_unlock(); 1109 1110 return entry ? xas.xa_index : -1; 1111 } 1112 1113 /* 1114 * If swap found in inode, free it and move page from swapcache to filecache. 1115 */ 1116 static int shmem_unuse_inode(struct shmem_inode_info *info, 1117 swp_entry_t swap, struct page **pagep) 1118 { 1119 struct address_space *mapping = info->vfs_inode.i_mapping; 1120 void *radswap; 1121 pgoff_t index; 1122 gfp_t gfp; 1123 int error = 0; 1124 1125 radswap = swp_to_radix_entry(swap); 1126 index = find_swap_entry(&mapping->i_pages, radswap); 1127 if (index == -1) 1128 return -EAGAIN; /* tell shmem_unuse we found nothing */ 1129 1130 /* 1131 * Move _head_ to start search for next from here. 1132 * But be careful: shmem_evict_inode checks list_empty without taking 1133 * mutex, and there's an instant in list_move_tail when info->swaplist 1134 * would appear empty, if it were the only one on shmem_swaplist. 
1135 */ 1136 if (shmem_swaplist.next != &info->swaplist) 1137 list_move_tail(&shmem_swaplist, &info->swaplist); 1138 1139 gfp = mapping_gfp_mask(mapping); 1140 if (shmem_should_replace_page(*pagep, gfp)) { 1141 mutex_unlock(&shmem_swaplist_mutex); 1142 error = shmem_replace_page(pagep, gfp, info, index); 1143 mutex_lock(&shmem_swaplist_mutex); 1144 /* 1145 * We needed to drop mutex to make that restrictive page 1146 * allocation, but the inode might have been freed while we 1147 * dropped it: although a racing shmem_evict_inode() cannot 1148 * complete without emptying the page cache, our page lock 1149 * on this swapcache page is not enough to prevent that - 1150 * free_swap_and_cache() of our swap entry will only 1151 * trylock_page(), removing swap from page cache whatever. 1152 * 1153 * We must not proceed to shmem_add_to_page_cache() if the 1154 * inode has been freed, but of course we cannot rely on 1155 * inode or mapping or info to check that. However, we can 1156 * safely check if our swap entry is still in use (and here 1157 * it can't have got reused for another page): if it's still 1158 * in use, then the inode cannot have been freed yet, and we 1159 * can safely proceed (if it's no longer in use, that tells 1160 * nothing about the inode, but we don't need to unuse swap). 1161 */ 1162 if (!page_swapcount(*pagep)) 1163 error = -ENOENT; 1164 } 1165 1166 /* 1167 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 1168 * but also to hold up shmem_evict_inode(): so inode cannot be freed 1169 * beneath us (pagelock doesn't help until the page is in pagecache). 1170 */ 1171 if (!error) 1172 error = shmem_add_to_page_cache(*pagep, mapping, index, 1173 radswap, gfp); 1174 if (error != -ENOMEM) { 1175 /* 1176 * Truncation and eviction use free_swap_and_cache(), which 1177 * only does trylock page: if we raced, best clean up here. 1178 */ 1179 delete_from_swap_cache(*pagep); 1180 set_page_dirty(*pagep); 1181 if (!error) { 1182 spin_lock_irq(&info->lock); 1183 info->swapped--; 1184 spin_unlock_irq(&info->lock); 1185 swap_free(swap); 1186 } 1187 } 1188 return error; 1189 } 1190 1191 /* 1192 * Search through swapped inodes to find and replace swap by page. 1193 */ 1194 int shmem_unuse(swp_entry_t swap, struct page *page) 1195 { 1196 struct list_head *this, *next; 1197 struct shmem_inode_info *info; 1198 struct mem_cgroup *memcg; 1199 int error = 0; 1200 1201 /* 1202 * There's a faint possibility that swap page was replaced before 1203 * caller locked it: caller will come back later with the right page. 1204 */ 1205 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) 1206 goto out; 1207 1208 /* 1209 * Charge page using GFP_KERNEL while we can wait, before taking 1210 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 1211 * Charged back to the user (not to caller) when swap account is used. 
1212 */ 1213 error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, 1214 &memcg, false); 1215 if (error) 1216 goto out; 1217 /* No memory allocation: swap entry occupies the slot for the page */ 1218 error = -EAGAIN; 1219 1220 mutex_lock(&shmem_swaplist_mutex); 1221 list_for_each_safe(this, next, &shmem_swaplist) { 1222 info = list_entry(this, struct shmem_inode_info, swaplist); 1223 if (info->swapped) 1224 error = shmem_unuse_inode(info, swap, &page); 1225 else 1226 list_del_init(&info->swaplist); 1227 cond_resched(); 1228 if (error != -EAGAIN) 1229 break; 1230 /* found nothing in this: move on to search the next */ 1231 } 1232 mutex_unlock(&shmem_swaplist_mutex); 1233 1234 if (error) { 1235 if (error != -ENOMEM) 1236 error = 0; 1237 mem_cgroup_cancel_charge(page, memcg, false); 1238 } else 1239 mem_cgroup_commit_charge(page, memcg, true, false); 1240 out: 1241 unlock_page(page); 1242 put_page(page); 1243 return error; 1244 } 1245 1246 /* 1247 * Move the page from the page cache to the swap cache. 1248 */ 1249 static int shmem_writepage(struct page *page, struct writeback_control *wbc) 1250 { 1251 struct shmem_inode_info *info; 1252 struct address_space *mapping; 1253 struct inode *inode; 1254 swp_entry_t swap; 1255 pgoff_t index; 1256 1257 VM_BUG_ON_PAGE(PageCompound(page), page); 1258 BUG_ON(!PageLocked(page)); 1259 mapping = page->mapping; 1260 index = page->index; 1261 inode = mapping->host; 1262 info = SHMEM_I(inode); 1263 if (info->flags & VM_LOCKED) 1264 goto redirty; 1265 if (!total_swap_pages) 1266 goto redirty; 1267 1268 /* 1269 * Our capabilities prevent regular writeback or sync from ever calling 1270 * shmem_writepage; but a stacking filesystem might use ->writepage of 1271 * its underlying filesystem, in which case tmpfs should write out to 1272 * swap only in response to memory pressure, and not for the writeback 1273 * threads or sync. 1274 */ 1275 if (!wbc->for_reclaim) { 1276 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 1277 goto redirty; 1278 } 1279 1280 /* 1281 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC 1282 * value into swapfile.c, the only way we can correctly account for a 1283 * fallocated page arriving here is now to initialize it and write it. 1284 * 1285 * That's okay for a page already fallocated earlier, but if we have 1286 * not yet completed the fallocation, then (a) we want to keep track 1287 * of this page in case we have to undo it, and (b) it may not be a 1288 * good idea to continue anyway, once we're pushing into swap. So 1289 * reactivate the page, and let shmem_fallocate() quit when too many. 1290 */ 1291 if (!PageUptodate(page)) { 1292 if (inode->i_private) { 1293 struct shmem_falloc *shmem_falloc; 1294 spin_lock(&inode->i_lock); 1295 shmem_falloc = inode->i_private; 1296 if (shmem_falloc && 1297 !shmem_falloc->waitq && 1298 index >= shmem_falloc->start && 1299 index < shmem_falloc->next) 1300 shmem_falloc->nr_unswapped++; 1301 else 1302 shmem_falloc = NULL; 1303 spin_unlock(&inode->i_lock); 1304 if (shmem_falloc) 1305 goto redirty; 1306 } 1307 clear_highpage(page); 1308 flush_dcache_page(page); 1309 SetPageUptodate(page); 1310 } 1311 1312 swap = get_swap_page(page); 1313 if (!swap.val) 1314 goto redirty; 1315 1316 /* 1317 * Add inode to shmem_unuse()'s list of swapped-out inodes, 1318 * if it's not already there. Do it now before the page is 1319 * moved to swap cache, when its pagelock no longer protects 1320 * the inode from eviction. 
But don't unlock the mutex until 1321 * we've incremented swapped, because shmem_unuse_inode() will 1322 * prune a !swapped inode from the swaplist under this mutex. 1323 */ 1324 mutex_lock(&shmem_swaplist_mutex); 1325 if (list_empty(&info->swaplist)) 1326 list_add_tail(&info->swaplist, &shmem_swaplist); 1327 1328 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1329 spin_lock_irq(&info->lock); 1330 shmem_recalc_inode(inode); 1331 info->swapped++; 1332 spin_unlock_irq(&info->lock); 1333 1334 swap_shmem_alloc(swap); 1335 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); 1336 1337 mutex_unlock(&shmem_swaplist_mutex); 1338 BUG_ON(page_mapped(page)); 1339 swap_writepage(page, wbc); 1340 return 0; 1341 } 1342 1343 mutex_unlock(&shmem_swaplist_mutex); 1344 put_swap_page(page, swap); 1345 redirty: 1346 set_page_dirty(page); 1347 if (wbc->for_reclaim) 1348 return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ 1349 unlock_page(page); 1350 return 0; 1351 } 1352 1353 #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) 1354 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 1355 { 1356 char buffer[64]; 1357 1358 if (!mpol || mpol->mode == MPOL_DEFAULT) 1359 return; /* show nothing */ 1360 1361 mpol_to_str(buffer, sizeof(buffer), mpol); 1362 1363 seq_printf(seq, ",mpol=%s", buffer); 1364 } 1365 1366 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 1367 { 1368 struct mempolicy *mpol = NULL; 1369 if (sbinfo->mpol) { 1370 spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ 1371 mpol = sbinfo->mpol; 1372 mpol_get(mpol); 1373 spin_unlock(&sbinfo->stat_lock); 1374 } 1375 return mpol; 1376 } 1377 #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ 1378 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 1379 { 1380 } 1381 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 1382 { 1383 return NULL; 1384 } 1385 #endif /* CONFIG_NUMA && CONFIG_TMPFS */ 1386 #ifndef CONFIG_NUMA 1387 #define vm_policy vm_private_data 1388 #endif 1389 1390 static void shmem_pseudo_vma_init(struct vm_area_struct *vma, 1391 struct shmem_inode_info *info, pgoff_t index) 1392 { 1393 /* Create a pseudo vma that just contains the policy */ 1394 vma_init(vma, NULL); 1395 /* Bias interleave by inode number to distribute better across nodes */ 1396 vma->vm_pgoff = index + info->vfs_inode.i_ino; 1397 vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); 1398 } 1399 1400 static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) 1401 { 1402 /* Drop reference taken by mpol_shared_policy_lookup() */ 1403 mpol_cond_put(vma->vm_policy); 1404 } 1405 1406 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 1407 struct shmem_inode_info *info, pgoff_t index) 1408 { 1409 struct vm_area_struct pvma; 1410 struct page *page; 1411 struct vm_fault vmf; 1412 1413 shmem_pseudo_vma_init(&pvma, info, index); 1414 vmf.vma = &pvma; 1415 vmf.address = 0; 1416 page = swap_cluster_readahead(swap, gfp, &vmf); 1417 shmem_pseudo_vma_destroy(&pvma); 1418 1419 return page; 1420 } 1421 1422 static struct page *shmem_alloc_hugepage(gfp_t gfp, 1423 struct shmem_inode_info *info, pgoff_t index) 1424 { 1425 struct vm_area_struct pvma; 1426 struct address_space *mapping = info->vfs_inode.i_mapping; 1427 pgoff_t hindex; 1428 struct page *page; 1429 1430 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) 1431 return NULL; 1432 1433 hindex = round_down(index, HPAGE_PMD_NR); 1434 if (xa_find(&mapping->i_pages, &hindex, hindex + 
HPAGE_PMD_NR - 1, 1435 XA_PRESENT)) 1436 return NULL; 1437 1438 shmem_pseudo_vma_init(&pvma, info, hindex); 1439 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, 1440 HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); 1441 shmem_pseudo_vma_destroy(&pvma); 1442 if (page) 1443 prep_transhuge_page(page); 1444 return page; 1445 } 1446 1447 static struct page *shmem_alloc_page(gfp_t gfp, 1448 struct shmem_inode_info *info, pgoff_t index) 1449 { 1450 struct vm_area_struct pvma; 1451 struct page *page; 1452 1453 shmem_pseudo_vma_init(&pvma, info, index); 1454 page = alloc_page_vma(gfp, &pvma, 0); 1455 shmem_pseudo_vma_destroy(&pvma); 1456 1457 return page; 1458 } 1459 1460 static struct page *shmem_alloc_and_acct_page(gfp_t gfp, 1461 struct inode *inode, 1462 pgoff_t index, bool huge) 1463 { 1464 struct shmem_inode_info *info = SHMEM_I(inode); 1465 struct page *page; 1466 int nr; 1467 int err = -ENOSPC; 1468 1469 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) 1470 huge = false; 1471 nr = huge ? HPAGE_PMD_NR : 1; 1472 1473 if (!shmem_inode_acct_block(inode, nr)) 1474 goto failed; 1475 1476 if (huge) 1477 page = shmem_alloc_hugepage(gfp, info, index); 1478 else 1479 page = shmem_alloc_page(gfp, info, index); 1480 if (page) { 1481 __SetPageLocked(page); 1482 __SetPageSwapBacked(page); 1483 return page; 1484 } 1485 1486 err = -ENOMEM; 1487 shmem_inode_unacct_blocks(inode, nr); 1488 failed: 1489 return ERR_PTR(err); 1490 } 1491 1492 /* 1493 * When a page is moved from swapcache to shmem filecache (either by the 1494 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of 1495 * shmem_unuse_inode()), it may have been read in earlier from swap, in 1496 * ignorance of the mapping it belongs to. If that mapping has special 1497 * constraints (like the gma500 GEM driver, which requires RAM below 4GB), 1498 * we may need to copy to a suitable page before moving to filecache. 1499 * 1500 * In a future release, this may well be extended to respect cpuset and 1501 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); 1502 * but for now it is a simple matter of zone. 1503 */ 1504 static bool shmem_should_replace_page(struct page *page, gfp_t gfp) 1505 { 1506 return page_zonenum(page) > gfp_zone(gfp); 1507 } 1508 1509 static int shmem_replace_page(struct page **pagep, gfp_t gfp, 1510 struct shmem_inode_info *info, pgoff_t index) 1511 { 1512 struct page *oldpage, *newpage; 1513 struct address_space *swap_mapping; 1514 swp_entry_t entry; 1515 pgoff_t swap_index; 1516 int error; 1517 1518 oldpage = *pagep; 1519 entry.val = page_private(oldpage); 1520 swap_index = swp_offset(entry); 1521 swap_mapping = page_mapping(oldpage); 1522 1523 /* 1524 * We have arrived here because our zones are constrained, so don't 1525 * limit chance of success by further cpuset and node constraints. 1526 */ 1527 gfp &= ~GFP_CONSTRAINT_MASK; 1528 newpage = shmem_alloc_page(gfp, info, index); 1529 if (!newpage) 1530 return -ENOMEM; 1531 1532 get_page(newpage); 1533 copy_highpage(newpage, oldpage); 1534 flush_dcache_page(newpage); 1535 1536 __SetPageLocked(newpage); 1537 __SetPageSwapBacked(newpage); 1538 SetPageUptodate(newpage); 1539 set_page_private(newpage, entry.val); 1540 SetPageSwapCache(newpage); 1541 1542 /* 1543 * Our caller will very soon move newpage out of swapcache, but it's 1544 * a nice clean interface for us to replace oldpage by newpage there. 
1545 */ 1546 xa_lock_irq(&swap_mapping->i_pages); 1547 error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); 1548 if (!error) { 1549 __inc_node_page_state(newpage, NR_FILE_PAGES); 1550 __dec_node_page_state(oldpage, NR_FILE_PAGES); 1551 } 1552 xa_unlock_irq(&swap_mapping->i_pages); 1553 1554 if (unlikely(error)) { 1555 /* 1556 * Is this possible? I think not, now that our callers check 1557 * both PageSwapCache and page_private after getting page lock; 1558 * but be defensive. Reverse old to newpage for clear and free. 1559 */ 1560 oldpage = newpage; 1561 } else { 1562 mem_cgroup_migrate(oldpage, newpage); 1563 lru_cache_add_anon(newpage); 1564 *pagep = newpage; 1565 } 1566 1567 ClearPageSwapCache(oldpage); 1568 set_page_private(oldpage, 0); 1569 1570 unlock_page(oldpage); 1571 put_page(oldpage); 1572 put_page(oldpage); 1573 return error; 1574 } 1575 1576 /* 1577 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1578 * 1579 * If we allocate a new one we do not mark it dirty. That's up to the 1580 * vm. If we swap it in we mark it dirty since we also free the swap 1581 * entry since a page cannot live in both the swap and page cache. 1582 * 1583 * fault_mm and fault_type are only supplied by shmem_fault: 1584 * otherwise they are NULL. 1585 */ 1586 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 1587 struct page **pagep, enum sgp_type sgp, gfp_t gfp, 1588 struct vm_area_struct *vma, struct vm_fault *vmf, 1589 vm_fault_t *fault_type) 1590 { 1591 struct address_space *mapping = inode->i_mapping; 1592 struct shmem_inode_info *info = SHMEM_I(inode); 1593 struct shmem_sb_info *sbinfo; 1594 struct mm_struct *charge_mm; 1595 struct mem_cgroup *memcg; 1596 struct page *page; 1597 swp_entry_t swap; 1598 enum sgp_type sgp_huge = sgp; 1599 pgoff_t hindex = index; 1600 int error; 1601 int once = 0; 1602 int alloced = 0; 1603 1604 if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) 1605 return -EFBIG; 1606 if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) 1607 sgp = SGP_CACHE; 1608 repeat: 1609 swap.val = 0; 1610 page = find_lock_entry(mapping, index); 1611 if (xa_is_value(page)) { 1612 swap = radix_to_swp_entry(page); 1613 page = NULL; 1614 } 1615 1616 if (sgp <= SGP_CACHE && 1617 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 1618 error = -EINVAL; 1619 goto unlock; 1620 } 1621 1622 if (page && sgp == SGP_WRITE) 1623 mark_page_accessed(page); 1624 1625 /* fallocated page? */ 1626 if (page && !PageUptodate(page)) { 1627 if (sgp != SGP_READ) 1628 goto clear; 1629 unlock_page(page); 1630 put_page(page); 1631 page = NULL; 1632 } 1633 if (page || (sgp == SGP_READ && !swap.val)) { 1634 *pagep = page; 1635 return 0; 1636 } 1637 1638 /* 1639 * Fast cache lookup did not find it: 1640 * bring it back from swap or allocate. 1641 */ 1642 sbinfo = SHMEM_SB(inode->i_sb); 1643 charge_mm = vma ? vma->vm_mm : current->mm; 1644 1645 if (swap.val) { 1646 /* Look it up and read it in.. */ 1647 page = lookup_swap_cache(swap, NULL, 0); 1648 if (!page) { 1649 /* Or update major stats only when swapin succeeds?? 
							 */
			if (fault_type) {
				*fault_type |= VM_FAULT_MAJOR;
				count_vm_event(PGMAJFAULT);
				count_memcg_event_mm(charge_mm, PGMAJFAULT);
			}
			/* Here we actually start the io */
			page = shmem_swapin(swap, gfp, info, index);
			if (!page) {
				error = -ENOMEM;
				goto failed;
			}
		}

		/* We have to do this with page locked to prevent races */
		lock_page(page);
		if (!PageSwapCache(page) || page_private(page) != swap.val ||
		    !shmem_confirm_swap(mapping, index, swap)) {
			error = -EEXIST;	/* try again */
			goto unlock;
		}
		if (!PageUptodate(page)) {
			error = -EIO;
			goto failed;
		}
		wait_on_page_writeback(page);

		if (shmem_should_replace_page(page, gfp)) {
			error = shmem_replace_page(&page, gfp, info, index);
			if (error)
				goto failed;
		}

		error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
						    false);
		if (!error) {
			error = shmem_add_to_page_cache(page, mapping, index,
						swp_to_radix_entry(swap), gfp);
			/*
			 * We already confirmed swap under page lock, and make
			 * no memory allocation here, so usually no possibility
			 * of error; but free_swap_and_cache() only trylocks a
			 * page, so it is just possible that the entry has been
			 * truncated or holepunched since swap was confirmed.
			 * shmem_undo_range() will have done some of the
			 * unaccounting, now delete_from_swap_cache() will do
			 * the rest.
			 * Reset swap.val? No, leave it so "failed" goes back to
			 * "repeat": reading a hole and writing should succeed.
			 */
			if (error) {
				mem_cgroup_cancel_charge(page, memcg, false);
				delete_from_swap_cache(page);
			}
		}
		if (error)
			goto failed;

		mem_cgroup_commit_charge(page, memcg, true, false);

		spin_lock_irq(&info->lock);
		info->swapped--;
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);

		if (sgp == SGP_WRITE)
			mark_page_accessed(page);

		delete_from_swap_cache(page);
		set_page_dirty(page);
		swap_free(swap);

	} else {
		if (vma && userfaultfd_missing(vma)) {
			*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
			return 0;
		}

		/* shmem_symlink() */
		if (mapping->a_ops != &shmem_aops)
			goto alloc_nohuge;
		if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
			goto alloc_nohuge;
		if (shmem_huge == SHMEM_HUGE_FORCE)
			goto alloc_huge;
		switch (sbinfo->huge) {
			loff_t i_size;
			pgoff_t off;
		case SHMEM_HUGE_NEVER:
			goto alloc_nohuge;
		case SHMEM_HUGE_WITHIN_SIZE:
			off = round_up(index, HPAGE_PMD_NR);
			i_size = round_up(i_size_read(inode), PAGE_SIZE);
			if (i_size >= HPAGE_PMD_SIZE &&
			    i_size >> PAGE_SHIFT >= off)
				goto alloc_huge;
			/* fallthrough */
		case SHMEM_HUGE_ADVISE:
			if (sgp_huge == SGP_HUGE)
				goto alloc_huge;
			/* TODO: implement fadvise() hints */
			goto alloc_nohuge;
		}

alloc_huge:
		page = shmem_alloc_and_acct_page(gfp, inode, index, true);
		if (IS_ERR(page)) {
alloc_nohuge:		page = shmem_alloc_and_acct_page(gfp, inode,
					index, false);
		}
		if (IS_ERR(page)) {
			int retry = 5;
			error = PTR_ERR(page);
			page = NULL;
			if (error != -ENOSPC)
				goto failed;
			/*
			 * Try to reclaim some space by splitting a huge page
			 * beyond i_size on the filesystem.
1768 */ 1769 while (retry--) { 1770 int ret; 1771 ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); 1772 if (ret == SHRINK_STOP) 1773 break; 1774 if (ret) 1775 goto alloc_nohuge; 1776 } 1777 goto failed; 1778 } 1779 1780 if (PageTransHuge(page)) 1781 hindex = round_down(index, HPAGE_PMD_NR); 1782 else 1783 hindex = index; 1784 1785 if (sgp == SGP_WRITE) 1786 __SetPageReferenced(page); 1787 1788 error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, 1789 PageTransHuge(page)); 1790 if (error) 1791 goto unacct; 1792 error = shmem_add_to_page_cache(page, mapping, hindex, 1793 NULL, gfp & GFP_RECLAIM_MASK); 1794 if (error) { 1795 mem_cgroup_cancel_charge(page, memcg, 1796 PageTransHuge(page)); 1797 goto unacct; 1798 } 1799 mem_cgroup_commit_charge(page, memcg, false, 1800 PageTransHuge(page)); 1801 lru_cache_add_anon(page); 1802 1803 spin_lock_irq(&info->lock); 1804 info->alloced += 1 << compound_order(page); 1805 inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); 1806 shmem_recalc_inode(inode); 1807 spin_unlock_irq(&info->lock); 1808 alloced = true; 1809 1810 if (PageTransHuge(page) && 1811 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < 1812 hindex + HPAGE_PMD_NR - 1) { 1813 /* 1814 * Part of the huge page is beyond i_size: subject 1815 * to shrink under memory pressure. 1816 */ 1817 spin_lock(&sbinfo->shrinklist_lock); 1818 /* 1819 * _careful to defend against unlocked access to 1820 * ->shrink_list in shmem_unused_huge_shrink() 1821 */ 1822 if (list_empty_careful(&info->shrinklist)) { 1823 list_add_tail(&info->shrinklist, 1824 &sbinfo->shrinklist); 1825 sbinfo->shrinklist_len++; 1826 } 1827 spin_unlock(&sbinfo->shrinklist_lock); 1828 } 1829 1830 /* 1831 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 1832 */ 1833 if (sgp == SGP_FALLOC) 1834 sgp = SGP_WRITE; 1835 clear: 1836 /* 1837 * Let SGP_WRITE caller clear ends if write does not fill page; 1838 * but SGP_FALLOC on a page fallocated earlier must initialize 1839 * it now, lest undo on failure cancel our earlier guarantee. 1840 */ 1841 if (sgp != SGP_WRITE && !PageUptodate(page)) { 1842 struct page *head = compound_head(page); 1843 int i; 1844 1845 for (i = 0; i < (1 << compound_order(head)); i++) { 1846 clear_highpage(head + i); 1847 flush_dcache_page(head + i); 1848 } 1849 SetPageUptodate(head); 1850 } 1851 } 1852 1853 /* Perhaps the file has been truncated since we checked */ 1854 if (sgp <= SGP_CACHE && 1855 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 1856 if (alloced) { 1857 ClearPageDirty(page); 1858 delete_from_page_cache(page); 1859 spin_lock_irq(&info->lock); 1860 shmem_recalc_inode(inode); 1861 spin_unlock_irq(&info->lock); 1862 } 1863 error = -EINVAL; 1864 goto unlock; 1865 } 1866 *pagep = page + index - hindex; 1867 return 0; 1868 1869 /* 1870 * Error recovery. 
1871 */ 1872 unacct: 1873 shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); 1874 1875 if (PageTransHuge(page)) { 1876 unlock_page(page); 1877 put_page(page); 1878 goto alloc_nohuge; 1879 } 1880 failed: 1881 if (swap.val && !shmem_confirm_swap(mapping, index, swap)) 1882 error = -EEXIST; 1883 unlock: 1884 if (page) { 1885 unlock_page(page); 1886 put_page(page); 1887 } 1888 if (error == -ENOSPC && !once++) { 1889 spin_lock_irq(&info->lock); 1890 shmem_recalc_inode(inode); 1891 spin_unlock_irq(&info->lock); 1892 goto repeat; 1893 } 1894 if (error == -EEXIST) 1895 goto repeat; 1896 return error; 1897 } 1898 1899 /* 1900 * This is like autoremove_wake_function, but it removes the wait queue 1901 * entry unconditionally - even if something else had already woken the 1902 * target. 1903 */ 1904 static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 1905 { 1906 int ret = default_wake_function(wait, mode, sync, key); 1907 list_del_init(&wait->entry); 1908 return ret; 1909 } 1910 1911 static vm_fault_t shmem_fault(struct vm_fault *vmf) 1912 { 1913 struct vm_area_struct *vma = vmf->vma; 1914 struct inode *inode = file_inode(vma->vm_file); 1915 gfp_t gfp = mapping_gfp_mask(inode->i_mapping); 1916 enum sgp_type sgp; 1917 int err; 1918 vm_fault_t ret = VM_FAULT_LOCKED; 1919 1920 /* 1921 * Trinity finds that probing a hole which tmpfs is punching can 1922 * prevent the hole-punch from ever completing: which in turn 1923 * locks writers out with its hold on i_mutex. So refrain from 1924 * faulting pages into the hole while it's being punched. Although 1925 * shmem_undo_range() does remove the additions, it may be unable to 1926 * keep up, as each new page needs its own unmap_mapping_range() call, 1927 * and the i_mmap tree grows ever slower to scan if new vmas are added. 1928 * 1929 * It does not matter if we sometimes reach this check just before the 1930 * hole-punch begins, so that one fault then races with the punch: 1931 * we just need to make racing faults a rare case. 1932 * 1933 * The implementation below would be much simpler if we just used a 1934 * standard mutex or completion: but we cannot take i_mutex in fault, 1935 * and bloating every shmem inode for this unlikely case would be sad. 1936 */ 1937 if (unlikely(inode->i_private)) { 1938 struct shmem_falloc *shmem_falloc; 1939 1940 spin_lock(&inode->i_lock); 1941 shmem_falloc = inode->i_private; 1942 if (shmem_falloc && 1943 shmem_falloc->waitq && 1944 vmf->pgoff >= shmem_falloc->start && 1945 vmf->pgoff < shmem_falloc->next) { 1946 wait_queue_head_t *shmem_falloc_waitq; 1947 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); 1948 1949 ret = VM_FAULT_NOPAGE; 1950 if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && 1951 !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { 1952 /* It's polite to up mmap_sem if we can */ 1953 up_read(&vma->vm_mm->mmap_sem); 1954 ret = VM_FAULT_RETRY; 1955 } 1956 1957 shmem_falloc_waitq = shmem_falloc->waitq; 1958 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, 1959 TASK_UNINTERRUPTIBLE); 1960 spin_unlock(&inode->i_lock); 1961 schedule(); 1962 1963 /* 1964 * shmem_falloc_waitq points into the shmem_fallocate() 1965 * stack of the hole-punching task: shmem_falloc_waitq 1966 * is usually invalid by the time we reach here, but 1967 * finish_wait() does not dereference it in that case; 1968 * though i_lock needed lest racing with wake_up_all(). 
1969 */ 1970 spin_lock(&inode->i_lock); 1971 finish_wait(shmem_falloc_waitq, &shmem_fault_wait); 1972 spin_unlock(&inode->i_lock); 1973 return ret; 1974 } 1975 spin_unlock(&inode->i_lock); 1976 } 1977 1978 sgp = SGP_CACHE; 1979 1980 if ((vma->vm_flags & VM_NOHUGEPAGE) || 1981 test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) 1982 sgp = SGP_NOHUGE; 1983 else if (vma->vm_flags & VM_HUGEPAGE) 1984 sgp = SGP_HUGE; 1985 1986 err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, 1987 gfp, vma, vmf, &ret); 1988 if (err) 1989 return vmf_error(err); 1990 return ret; 1991 } 1992 1993 unsigned long shmem_get_unmapped_area(struct file *file, 1994 unsigned long uaddr, unsigned long len, 1995 unsigned long pgoff, unsigned long flags) 1996 { 1997 unsigned long (*get_area)(struct file *, 1998 unsigned long, unsigned long, unsigned long, unsigned long); 1999 unsigned long addr; 2000 unsigned long offset; 2001 unsigned long inflated_len; 2002 unsigned long inflated_addr; 2003 unsigned long inflated_offset; 2004 2005 if (len > TASK_SIZE) 2006 return -ENOMEM; 2007 2008 get_area = current->mm->get_unmapped_area; 2009 addr = get_area(file, uaddr, len, pgoff, flags); 2010 2011 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) 2012 return addr; 2013 if (IS_ERR_VALUE(addr)) 2014 return addr; 2015 if (addr & ~PAGE_MASK) 2016 return addr; 2017 if (addr > TASK_SIZE - len) 2018 return addr; 2019 2020 if (shmem_huge == SHMEM_HUGE_DENY) 2021 return addr; 2022 if (len < HPAGE_PMD_SIZE) 2023 return addr; 2024 if (flags & MAP_FIXED) 2025 return addr; 2026 /* 2027 * Our priority is to support MAP_SHARED mapped hugely; 2028 * and support MAP_PRIVATE mapped hugely too, until it is COWed. 2029 * But if caller specified an address hint, respect that as before. 2030 */ 2031 if (uaddr) 2032 return addr; 2033 2034 if (shmem_huge != SHMEM_HUGE_FORCE) { 2035 struct super_block *sb; 2036 2037 if (file) { 2038 VM_BUG_ON(file->f_op != &shmem_file_operations); 2039 sb = file_inode(file)->i_sb; 2040 } else { 2041 /* 2042 * Called directly from mm/mmap.c, or drivers/char/mem.c 2043 * for "/dev/zero", to create a shared anonymous object. 
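 * (For example, an anonymous MAP_SHARED mmap() arrives here with file == NULL from get_unmapped_area(), before shmem_zero_setup() below attaches the internal tmpfs file to the vma.)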
2044 */ 2045 if (IS_ERR(shm_mnt)) 2046 return addr; 2047 sb = shm_mnt->mnt_sb; 2048 } 2049 if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) 2050 return addr; 2051 } 2052 2053 offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); 2054 if (offset && offset + len < 2 * HPAGE_PMD_SIZE) 2055 return addr; 2056 if ((addr & (HPAGE_PMD_SIZE-1)) == offset) 2057 return addr; 2058 2059 inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; 2060 if (inflated_len > TASK_SIZE) 2061 return addr; 2062 if (inflated_len < len) 2063 return addr; 2064 2065 inflated_addr = get_area(NULL, 0, inflated_len, 0, flags); 2066 if (IS_ERR_VALUE(inflated_addr)) 2067 return addr; 2068 if (inflated_addr & ~PAGE_MASK) 2069 return addr; 2070 2071 inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); 2072 inflated_addr += offset - inflated_offset; 2073 if (inflated_offset > offset) 2074 inflated_addr += HPAGE_PMD_SIZE; 2075 2076 if (inflated_addr > TASK_SIZE - len) 2077 return addr; 2078 return inflated_addr; 2079 } 2080 2081 #ifdef CONFIG_NUMA 2082 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 2083 { 2084 struct inode *inode = file_inode(vma->vm_file); 2085 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 2086 } 2087 2088 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 2089 unsigned long addr) 2090 { 2091 struct inode *inode = file_inode(vma->vm_file); 2092 pgoff_t index; 2093 2094 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2095 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 2096 } 2097 #endif 2098 2099 int shmem_lock(struct file *file, int lock, struct user_struct *user) 2100 { 2101 struct inode *inode = file_inode(file); 2102 struct shmem_inode_info *info = SHMEM_I(inode); 2103 int retval = -ENOMEM; 2104 2105 spin_lock_irq(&info->lock); 2106 if (lock && !(info->flags & VM_LOCKED)) { 2107 if (!user_shm_lock(inode->i_size, user)) 2108 goto out_nomem; 2109 info->flags |= VM_LOCKED; 2110 mapping_set_unevictable(file->f_mapping); 2111 } 2112 if (!lock && (info->flags & VM_LOCKED) && user) { 2113 user_shm_unlock(inode->i_size, user); 2114 info->flags &= ~VM_LOCKED; 2115 mapping_clear_unevictable(file->f_mapping); 2116 } 2117 retval = 0; 2118 2119 out_nomem: 2120 spin_unlock_irq(&info->lock); 2121 return retval; 2122 } 2123 2124 static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 2125 { 2126 file_accessed(file); 2127 vma->vm_ops = &shmem_vm_ops; 2128 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && 2129 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < 2130 (vma->vm_end & HPAGE_PMD_MASK)) { 2131 khugepaged_enter(vma, vma->vm_flags); 2132 } 2133 return 0; 2134 } 2135 2136 static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, 2137 umode_t mode, dev_t dev, unsigned long flags) 2138 { 2139 struct inode *inode; 2140 struct shmem_inode_info *info; 2141 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2142 2143 if (shmem_reserve_inode(sb)) 2144 return NULL; 2145 2146 inode = new_inode(sb); 2147 if (inode) { 2148 inode->i_ino = get_next_ino(); 2149 inode_init_owner(inode, dir, mode); 2150 inode->i_blocks = 0; 2151 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); 2152 inode->i_generation = prandom_u32(); 2153 info = SHMEM_I(inode); 2154 memset(info, 0, (char *)inode - (char *)info); 2155 spin_lock_init(&info->lock); 2156 info->seals = F_SEAL_SEAL; 2157 info->flags = flags & VM_NORESERVE; 2158 INIT_LIST_HEAD(&info->shrinklist); 2159 
INIT_LIST_HEAD(&info->swaplist); 2160 simple_xattrs_init(&info->xattrs); 2161 cache_no_acl(inode); 2162 2163 switch (mode & S_IFMT) { 2164 default: 2165 inode->i_op = &shmem_special_inode_operations; 2166 init_special_inode(inode, mode, dev); 2167 break; 2168 case S_IFREG: 2169 inode->i_mapping->a_ops = &shmem_aops; 2170 inode->i_op = &shmem_inode_operations; 2171 inode->i_fop = &shmem_file_operations; 2172 mpol_shared_policy_init(&info->policy, 2173 shmem_get_sbmpol(sbinfo)); 2174 break; 2175 case S_IFDIR: 2176 inc_nlink(inode); 2177 /* Some things misbehave if size == 0 on a directory */ 2178 inode->i_size = 2 * BOGO_DIRENT_SIZE; 2179 inode->i_op = &shmem_dir_inode_operations; 2180 inode->i_fop = &simple_dir_operations; 2181 break; 2182 case S_IFLNK: 2183 /* 2184 * Must not load anything in the rbtree, 2185 * mpol_free_shared_policy will not be called. 2186 */ 2187 mpol_shared_policy_init(&info->policy, NULL); 2188 break; 2189 } 2190 2191 lockdep_annotate_inode_mutex_key(inode); 2192 } else 2193 shmem_free_inode(sb); 2194 return inode; 2195 } 2196 2197 bool shmem_mapping(struct address_space *mapping) 2198 { 2199 return mapping->a_ops == &shmem_aops; 2200 } 2201 2202 static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, 2203 pmd_t *dst_pmd, 2204 struct vm_area_struct *dst_vma, 2205 unsigned long dst_addr, 2206 unsigned long src_addr, 2207 bool zeropage, 2208 struct page **pagep) 2209 { 2210 struct inode *inode = file_inode(dst_vma->vm_file); 2211 struct shmem_inode_info *info = SHMEM_I(inode); 2212 struct address_space *mapping = inode->i_mapping; 2213 gfp_t gfp = mapping_gfp_mask(mapping); 2214 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 2215 struct mem_cgroup *memcg; 2216 spinlock_t *ptl; 2217 void *page_kaddr; 2218 struct page *page; 2219 pte_t _dst_pte, *dst_pte; 2220 int ret; 2221 pgoff_t offset, max_off; 2222 2223 ret = -ENOMEM; 2224 if (!shmem_inode_acct_block(inode, 1)) 2225 goto out; 2226 2227 if (!*pagep) { 2228 page = shmem_alloc_page(gfp, info, pgoff); 2229 if (!page) 2230 goto out_unacct_blocks; 2231 2232 if (!zeropage) { /* mcopy_atomic */ 2233 page_kaddr = kmap_atomic(page); 2234 ret = copy_from_user(page_kaddr, 2235 (const void __user *)src_addr, 2236 PAGE_SIZE); 2237 kunmap_atomic(page_kaddr); 2238 2239 /* fallback to copy_from_user outside mmap_sem */ 2240 if (unlikely(ret)) { 2241 *pagep = page; 2242 shmem_inode_unacct_blocks(inode, 1); 2243 /* don't free the page */ 2244 return -ENOENT; 2245 } 2246 } else { /* mfill_zeropage_atomic */ 2247 clear_highpage(page); 2248 } 2249 } else { 2250 page = *pagep; 2251 *pagep = NULL; 2252 } 2253 2254 VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); 2255 __SetPageLocked(page); 2256 __SetPageSwapBacked(page); 2257 __SetPageUptodate(page); 2258 2259 ret = -EFAULT; 2260 offset = linear_page_index(dst_vma, dst_addr); 2261 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2262 if (unlikely(offset >= max_off)) 2263 goto out_release; 2264 2265 ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); 2266 if (ret) 2267 goto out_release; 2268 2269 ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, 2270 gfp & GFP_RECLAIM_MASK); 2271 if (ret) 2272 goto out_release_uncharge; 2273 2274 mem_cgroup_commit_charge(page, memcg, false, false); 2275 2276 _dst_pte = mk_pte(page, dst_vma->vm_page_prot); 2277 if (dst_vma->vm_flags & VM_WRITE) 2278 _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); 2279 else { 2280 /* 2281 * We don't set the pte dirty if the vma has no 2282 * VM_WRITE permission, so mark the page 
dirty or it 2283 * could be freed from under us. We could do it 2284 * unconditionally before unlock_page(), but doing it 2285 * only if VM_WRITE is not set is faster. 2286 */ 2287 set_page_dirty(page); 2288 } 2289 2290 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 2291 2292 ret = -EFAULT; 2293 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2294 if (unlikely(offset >= max_off)) 2295 goto out_release_uncharge_unlock; 2296 2297 ret = -EEXIST; 2298 if (!pte_none(*dst_pte)) 2299 goto out_release_uncharge_unlock; 2300 2301 lru_cache_add_anon(page); 2302 2303 spin_lock(&info->lock); 2304 info->alloced++; 2305 inode->i_blocks += BLOCKS_PER_PAGE; 2306 shmem_recalc_inode(inode); 2307 spin_unlock(&info->lock); 2308 2309 inc_mm_counter(dst_mm, mm_counter_file(page)); 2310 page_add_file_rmap(page, false); 2311 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 2312 2313 /* No need to invalidate - it was non-present before */ 2314 update_mmu_cache(dst_vma, dst_addr, dst_pte); 2315 pte_unmap_unlock(dst_pte, ptl); 2316 unlock_page(page); 2317 ret = 0; 2318 out: 2319 return ret; 2320 out_release_uncharge_unlock: 2321 pte_unmap_unlock(dst_pte, ptl); 2322 ClearPageDirty(page); 2323 delete_from_page_cache(page); 2324 out_release_uncharge: 2325 mem_cgroup_cancel_charge(page, memcg, false); 2326 out_release: 2327 unlock_page(page); 2328 put_page(page); 2329 out_unacct_blocks: 2330 shmem_inode_unacct_blocks(inode, 1); 2331 goto out; 2332 } 2333 2334 int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, 2335 pmd_t *dst_pmd, 2336 struct vm_area_struct *dst_vma, 2337 unsigned long dst_addr, 2338 unsigned long src_addr, 2339 struct page **pagep) 2340 { 2341 return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, 2342 dst_addr, src_addr, false, pagep); 2343 } 2344 2345 int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, 2346 pmd_t *dst_pmd, 2347 struct vm_area_struct *dst_vma, 2348 unsigned long dst_addr) 2349 { 2350 struct page *page = NULL; 2351 2352 return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, 2353 dst_addr, 0, true, &page); 2354 } 2355 2356 #ifdef CONFIG_TMPFS 2357 static const struct inode_operations shmem_symlink_inode_operations; 2358 static const struct inode_operations shmem_short_symlink_operations; 2359 2360 #ifdef CONFIG_TMPFS_XATTR 2361 static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 2362 #else 2363 #define shmem_initxattrs NULL 2364 #endif 2365 2366 static int 2367 shmem_write_begin(struct file *file, struct address_space *mapping, 2368 loff_t pos, unsigned len, unsigned flags, 2369 struct page **pagep, void **fsdata) 2370 { 2371 struct inode *inode = mapping->host; 2372 struct shmem_inode_info *info = SHMEM_I(inode); 2373 pgoff_t index = pos >> PAGE_SHIFT; 2374 2375 /* i_mutex is held by caller */ 2376 if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { 2377 if (info->seals & F_SEAL_WRITE) 2378 return -EPERM; 2379 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 2380 return -EPERM; 2381 } 2382 2383 return shmem_getpage(inode, index, pagep, SGP_WRITE); 2384 } 2385 2386 static int 2387 shmem_write_end(struct file *file, struct address_space *mapping, 2388 loff_t pos, unsigned len, unsigned copied, 2389 struct page *page, void *fsdata) 2390 { 2391 struct inode *inode = mapping->host; 2392 2393 if (pos + copied > inode->i_size) 2394 i_size_write(inode, pos + copied); 2395 2396 if (!PageUptodate(page)) { 2397 struct page *head = compound_head(page); 2398 if (PageTransCompound(page)) { 2399 int i; 2400 2401 for (i = 0; i < 
HPAGE_PMD_NR; i++) { 2402 if (head + i == page) 2403 continue; 2404 clear_highpage(head + i); 2405 flush_dcache_page(head + i); 2406 } 2407 } 2408 if (copied < PAGE_SIZE) { 2409 unsigned from = pos & (PAGE_SIZE - 1); 2410 zero_user_segments(page, 0, from, 2411 from + copied, PAGE_SIZE); 2412 } 2413 SetPageUptodate(head); 2414 } 2415 set_page_dirty(page); 2416 unlock_page(page); 2417 put_page(page); 2418 2419 return copied; 2420 } 2421 2422 static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 2423 { 2424 struct file *file = iocb->ki_filp; 2425 struct inode *inode = file_inode(file); 2426 struct address_space *mapping = inode->i_mapping; 2427 pgoff_t index; 2428 unsigned long offset; 2429 enum sgp_type sgp = SGP_READ; 2430 int error = 0; 2431 ssize_t retval = 0; 2432 loff_t *ppos = &iocb->ki_pos; 2433 2434 /* 2435 * Might this read be for a stacking filesystem? Then when reading 2436 * holes of a sparse file, we actually need to allocate those pages, 2437 * and even mark them dirty, so it cannot exceed the max_blocks limit. 2438 */ 2439 if (!iter_is_iovec(to)) 2440 sgp = SGP_CACHE; 2441 2442 index = *ppos >> PAGE_SHIFT; 2443 offset = *ppos & ~PAGE_MASK; 2444 2445 for (;;) { 2446 struct page *page = NULL; 2447 pgoff_t end_index; 2448 unsigned long nr, ret; 2449 loff_t i_size = i_size_read(inode); 2450 2451 end_index = i_size >> PAGE_SHIFT; 2452 if (index > end_index) 2453 break; 2454 if (index == end_index) { 2455 nr = i_size & ~PAGE_MASK; 2456 if (nr <= offset) 2457 break; 2458 } 2459 2460 error = shmem_getpage(inode, index, &page, sgp); 2461 if (error) { 2462 if (error == -EINVAL) 2463 error = 0; 2464 break; 2465 } 2466 if (page) { 2467 if (sgp == SGP_CACHE) 2468 set_page_dirty(page); 2469 unlock_page(page); 2470 } 2471 2472 /* 2473 * We must evaluate after, since reads (unlike writes) 2474 * are called without i_mutex protection against truncate 2475 */ 2476 nr = PAGE_SIZE; 2477 i_size = i_size_read(inode); 2478 end_index = i_size >> PAGE_SHIFT; 2479 if (index == end_index) { 2480 nr = i_size & ~PAGE_MASK; 2481 if (nr <= offset) { 2482 if (page) 2483 put_page(page); 2484 break; 2485 } 2486 } 2487 nr -= offset; 2488 2489 if (page) { 2490 /* 2491 * If users can be writing to this page using arbitrary 2492 * virtual addresses, take care about potential aliasing 2493 * before reading the page on the kernel side. 2494 */ 2495 if (mapping_writably_mapped(mapping)) 2496 flush_dcache_page(page); 2497 /* 2498 * Mark the page accessed if we read the beginning. 2499 */ 2500 if (!offset) 2501 mark_page_accessed(page); 2502 } else { 2503 page = ZERO_PAGE(0); 2504 get_page(page); 2505 } 2506 2507 /* 2508 * Ok, we have the page, and it's up-to-date, so 2509 * now we can copy it to user space... 2510 */ 2511 ret = copy_page_to_iter(page, offset, nr, to); 2512 retval += ret; 2513 offset += ret; 2514 index += offset >> PAGE_SHIFT; 2515 offset &= ~PAGE_MASK; 2516 2517 put_page(page); 2518 if (!iov_iter_count(to)) 2519 break; 2520 if (ret < nr) { 2521 error = -EFAULT; 2522 break; 2523 } 2524 cond_resched(); 2525 } 2526 2527 *ppos = ((loff_t) index << PAGE_SHIFT) + offset; 2528 file_accessed(file); 2529 return retval ? retval : error; 2530 } 2531 2532 /* 2533 * llseek SEEK_DATA or SEEK_HOLE through the page cache. 
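 * For example, lseek(fd, pos, SEEK_DATA) returns the first offset at or after pos backed by data (an uptodate page or a swap entry), and SEEK_HOLE the first that is not; offsets at or beyond i_size fail with ENXIO.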
2534 */ 2535 static pgoff_t shmem_seek_hole_data(struct address_space *mapping, 2536 pgoff_t index, pgoff_t end, int whence) 2537 { 2538 struct page *page; 2539 struct pagevec pvec; 2540 pgoff_t indices[PAGEVEC_SIZE]; 2541 bool done = false; 2542 int i; 2543 2544 pagevec_init(&pvec); 2545 pvec.nr = 1; /* start small: we may be there already */ 2546 while (!done) { 2547 pvec.nr = find_get_entries(mapping, index, 2548 pvec.nr, pvec.pages, indices); 2549 if (!pvec.nr) { 2550 if (whence == SEEK_DATA) 2551 index = end; 2552 break; 2553 } 2554 for (i = 0; i < pvec.nr; i++, index++) { 2555 if (index < indices[i]) { 2556 if (whence == SEEK_HOLE) { 2557 done = true; 2558 break; 2559 } 2560 index = indices[i]; 2561 } 2562 page = pvec.pages[i]; 2563 if (page && !xa_is_value(page)) { 2564 if (!PageUptodate(page)) 2565 page = NULL; 2566 } 2567 if (index >= end || 2568 (page && whence == SEEK_DATA) || 2569 (!page && whence == SEEK_HOLE)) { 2570 done = true; 2571 break; 2572 } 2573 } 2574 pagevec_remove_exceptionals(&pvec); 2575 pagevec_release(&pvec); 2576 pvec.nr = PAGEVEC_SIZE; 2577 cond_resched(); 2578 } 2579 return index; 2580 } 2581 2582 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) 2583 { 2584 struct address_space *mapping = file->f_mapping; 2585 struct inode *inode = mapping->host; 2586 pgoff_t start, end; 2587 loff_t new_offset; 2588 2589 if (whence != SEEK_DATA && whence != SEEK_HOLE) 2590 return generic_file_llseek_size(file, offset, whence, 2591 MAX_LFS_FILESIZE, i_size_read(inode)); 2592 inode_lock(inode); 2593 /* We're holding i_mutex so we can access i_size directly */ 2594 2595 if (offset < 0 || offset >= inode->i_size) 2596 offset = -ENXIO; 2597 else { 2598 start = offset >> PAGE_SHIFT; 2599 end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2600 new_offset = shmem_seek_hole_data(mapping, start, end, whence); 2601 new_offset <<= PAGE_SHIFT; 2602 if (new_offset > offset) { 2603 if (new_offset < inode->i_size) 2604 offset = new_offset; 2605 else if (whence == SEEK_DATA) 2606 offset = -ENXIO; 2607 else 2608 offset = inode->i_size; 2609 } 2610 } 2611 2612 if (offset >= 0) 2613 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); 2614 inode_unlock(inode); 2615 return offset; 2616 } 2617 2618 static long shmem_fallocate(struct file *file, int mode, loff_t offset, 2619 loff_t len) 2620 { 2621 struct inode *inode = file_inode(file); 2622 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 2623 struct shmem_inode_info *info = SHMEM_I(inode); 2624 struct shmem_falloc shmem_falloc; 2625 pgoff_t start, index, end; 2626 int error; 2627 2628 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2629 return -EOPNOTSUPP; 2630 2631 inode_lock(inode); 2632 2633 if (mode & FALLOC_FL_PUNCH_HOLE) { 2634 struct address_space *mapping = file->f_mapping; 2635 loff_t unmap_start = round_up(offset, PAGE_SIZE); 2636 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 2637 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 2638 2639 /* protected by i_mutex */ 2640 if (info->seals & F_SEAL_WRITE) { 2641 error = -EPERM; 2642 goto out; 2643 } 2644 2645 shmem_falloc.waitq = &shmem_falloc_waitq; 2646 shmem_falloc.start = unmap_start >> PAGE_SHIFT; 2647 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; 2648 spin_lock(&inode->i_lock); 2649 inode->i_private = &shmem_falloc; 2650 spin_unlock(&inode->i_lock); 2651 2652 if ((u64)unmap_end > (u64)unmap_start) 2653 unmap_mapping_range(mapping, unmap_start, 2654 1 + unmap_end - unmap_start, 0); 2655 shmem_truncate_range(inode, 
offset, offset + len - 1); 2656 /* No need to unmap again: hole-punching leaves COWed pages */ 2657 2658 spin_lock(&inode->i_lock); 2659 inode->i_private = NULL; 2660 wake_up_all(&shmem_falloc_waitq); 2661 WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); 2662 spin_unlock(&inode->i_lock); 2663 error = 0; 2664 goto out; 2665 } 2666 2667 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 2668 error = inode_newsize_ok(inode, offset + len); 2669 if (error) 2670 goto out; 2671 2672 if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { 2673 error = -EPERM; 2674 goto out; 2675 } 2676 2677 start = offset >> PAGE_SHIFT; 2678 end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 2679 /* Try to avoid a swapstorm if len is impossible to satisfy */ 2680 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 2681 error = -ENOSPC; 2682 goto out; 2683 } 2684 2685 shmem_falloc.waitq = NULL; 2686 shmem_falloc.start = start; 2687 shmem_falloc.next = start; 2688 shmem_falloc.nr_falloced = 0; 2689 shmem_falloc.nr_unswapped = 0; 2690 spin_lock(&inode->i_lock); 2691 inode->i_private = &shmem_falloc; 2692 spin_unlock(&inode->i_lock); 2693 2694 for (index = start; index < end; index++) { 2695 struct page *page; 2696 2697 /* 2698 * Good, the fallocate(2) manpage permits EINTR: we may have 2699 * been interrupted because we are using up too much memory. 2700 */ 2701 if (signal_pending(current)) 2702 error = -EINTR; 2703 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 2704 error = -ENOMEM; 2705 else 2706 error = shmem_getpage(inode, index, &page, SGP_FALLOC); 2707 if (error) { 2708 /* Remove the !PageUptodate pages we added */ 2709 if (index > start) { 2710 shmem_undo_range(inode, 2711 (loff_t)start << PAGE_SHIFT, 2712 ((loff_t)index << PAGE_SHIFT) - 1, true); 2713 } 2714 goto undone; 2715 } 2716 2717 /* 2718 * Inform shmem_writepage() how far we have reached. 2719 * No need for lock or barrier: we have the page lock. 2720 */ 2721 shmem_falloc.next++; 2722 if (!PageUptodate(page)) 2723 shmem_falloc.nr_falloced++; 2724 2725 /* 2726 * If !PageUptodate, leave it that way so that freeable pages 2727 * can be recognized if we need to rollback on error later. 2728 * But set_page_dirty so that memory pressure will swap rather 2729 * than free the pages we are allocating (and SGP_CACHE pages 2730 * might still be clean: we now need to mark those dirty too). 2731 */ 2732 set_page_dirty(page); 2733 unlock_page(page); 2734 put_page(page); 2735 cond_resched(); 2736 } 2737 2738 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 2739 i_size_write(inode, offset + len); 2740 inode->i_ctime = current_time(inode); 2741 undone: 2742 spin_lock(&inode->i_lock); 2743 inode->i_private = NULL; 2744 spin_unlock(&inode->i_lock); 2745 out: 2746 inode_unlock(inode); 2747 return error; 2748 } 2749 2750 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 2751 { 2752 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 2753 2754 buf->f_type = TMPFS_MAGIC; 2755 buf->f_bsize = PAGE_SIZE; 2756 buf->f_namelen = NAME_MAX; 2757 if (sbinfo->max_blocks) { 2758 buf->f_blocks = sbinfo->max_blocks; 2759 buf->f_bavail = 2760 buf->f_bfree = sbinfo->max_blocks - 2761 percpu_counter_sum(&sbinfo->used_blocks); 2762 } 2763 if (sbinfo->max_inodes) { 2764 buf->f_files = sbinfo->max_inodes; 2765 buf->f_ffree = sbinfo->free_inodes; 2766 } 2767 /* else leave those fields 0 like simple_statfs */ 2768 return 0; 2769 } 2770 2771 /* 2772 * File creation. Allocate an inode, and we're done.. 
2773 */ 2774 static int 2775 shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) 2776 { 2777 struct inode *inode; 2778 int error = -ENOSPC; 2779 2780 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 2781 if (inode) { 2782 error = simple_acl_create(dir, inode); 2783 if (error) 2784 goto out_iput; 2785 error = security_inode_init_security(inode, dir, 2786 &dentry->d_name, 2787 shmem_initxattrs, NULL); 2788 if (error && error != -EOPNOTSUPP) 2789 goto out_iput; 2790 2791 error = 0; 2792 dir->i_size += BOGO_DIRENT_SIZE; 2793 dir->i_ctime = dir->i_mtime = current_time(dir); 2794 d_instantiate(dentry, inode); 2795 dget(dentry); /* Extra count - pin the dentry in core */ 2796 } 2797 return error; 2798 out_iput: 2799 iput(inode); 2800 return error; 2801 } 2802 2803 static int 2804 shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 2805 { 2806 struct inode *inode; 2807 int error = -ENOSPC; 2808 2809 inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); 2810 if (inode) { 2811 error = security_inode_init_security(inode, dir, 2812 NULL, 2813 shmem_initxattrs, NULL); 2814 if (error && error != -EOPNOTSUPP) 2815 goto out_iput; 2816 error = simple_acl_create(dir, inode); 2817 if (error) 2818 goto out_iput; 2819 d_tmpfile(dentry, inode); 2820 } 2821 return error; 2822 out_iput: 2823 iput(inode); 2824 return error; 2825 } 2826 2827 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2828 { 2829 int error; 2830 2831 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) 2832 return error; 2833 inc_nlink(dir); 2834 return 0; 2835 } 2836 2837 static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 2838 bool excl) 2839 { 2840 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 2841 } 2842 2843 /* 2844 * Link a file.. 2845 */ 2846 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 2847 { 2848 struct inode *inode = d_inode(old_dentry); 2849 int ret; 2850 2851 /* 2852 * No ordinary (disk based) filesystem counts links as inodes; 2853 * but each new link needs a new dentry, pinning lowmem, and 2854 * tmpfs dentries cannot be pruned until they are unlinked. 
2855 */ 2856 ret = shmem_reserve_inode(inode->i_sb); 2857 if (ret) 2858 goto out; 2859 2860 dir->i_size += BOGO_DIRENT_SIZE; 2861 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 2862 inc_nlink(inode); 2863 ihold(inode); /* New dentry reference */ 2864 dget(dentry); /* Extra pinning count for the created dentry */ 2865 d_instantiate(dentry, inode); 2866 out: 2867 return ret; 2868 } 2869 2870 static int shmem_unlink(struct inode *dir, struct dentry *dentry) 2871 { 2872 struct inode *inode = d_inode(dentry); 2873 2874 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 2875 shmem_free_inode(inode->i_sb); 2876 2877 dir->i_size -= BOGO_DIRENT_SIZE; 2878 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 2879 drop_nlink(inode); 2880 dput(dentry); /* Undo the count from "create" - this does all the work */ 2881 return 0; 2882 } 2883 2884 static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 2885 { 2886 if (!simple_empty(dentry)) 2887 return -ENOTEMPTY; 2888 2889 drop_nlink(d_inode(dentry)); 2890 drop_nlink(dir); 2891 return shmem_unlink(dir, dentry); 2892 } 2893 2894 static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) 2895 { 2896 bool old_is_dir = d_is_dir(old_dentry); 2897 bool new_is_dir = d_is_dir(new_dentry); 2898 2899 if (old_dir != new_dir && old_is_dir != new_is_dir) { 2900 if (old_is_dir) { 2901 drop_nlink(old_dir); 2902 inc_nlink(new_dir); 2903 } else { 2904 drop_nlink(new_dir); 2905 inc_nlink(old_dir); 2906 } 2907 } 2908 old_dir->i_ctime = old_dir->i_mtime = 2909 new_dir->i_ctime = new_dir->i_mtime = 2910 d_inode(old_dentry)->i_ctime = 2911 d_inode(new_dentry)->i_ctime = current_time(old_dir); 2912 2913 return 0; 2914 } 2915 2916 static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) 2917 { 2918 struct dentry *whiteout; 2919 int error; 2920 2921 whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); 2922 if (!whiteout) 2923 return -ENOMEM; 2924 2925 error = shmem_mknod(old_dir, whiteout, 2926 S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); 2927 dput(whiteout); 2928 if (error) 2929 return error; 2930 2931 /* 2932 * Cheat and hash the whiteout while the old dentry is still in 2933 * place, instead of playing games with FS_RENAME_DOES_D_MOVE. 2934 * 2935 * d_lookup() will consistently find one of them at this point, 2936 * not sure which one, but that isn't even important. 2937 */ 2938 d_rehash(whiteout); 2939 return 0; 2940 } 2941 2942 /* 2943 * The VFS layer already does all the dentry stuff for rename, 2944 * we just have to decrement the usage count for the target if 2945 * it exists so that the VFS layer correctly free's it when it 2946 * gets overwritten. 
2947 */ 2948 static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) 2949 { 2950 struct inode *inode = d_inode(old_dentry); 2951 int they_are_dirs = S_ISDIR(inode->i_mode); 2952 2953 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2954 return -EINVAL; 2955 2956 if (flags & RENAME_EXCHANGE) 2957 return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry); 2958 2959 if (!simple_empty(new_dentry)) 2960 return -ENOTEMPTY; 2961 2962 if (flags & RENAME_WHITEOUT) { 2963 int error; 2964 2965 error = shmem_whiteout(old_dir, old_dentry); 2966 if (error) 2967 return error; 2968 } 2969 2970 if (d_really_is_positive(new_dentry)) { 2971 (void) shmem_unlink(new_dir, new_dentry); 2972 if (they_are_dirs) { 2973 drop_nlink(d_inode(new_dentry)); 2974 drop_nlink(old_dir); 2975 } 2976 } else if (they_are_dirs) { 2977 drop_nlink(old_dir); 2978 inc_nlink(new_dir); 2979 } 2980 2981 old_dir->i_size -= BOGO_DIRENT_SIZE; 2982 new_dir->i_size += BOGO_DIRENT_SIZE; 2983 old_dir->i_ctime = old_dir->i_mtime = 2984 new_dir->i_ctime = new_dir->i_mtime = 2985 inode->i_ctime = current_time(old_dir); 2986 return 0; 2987 } 2988 2989 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 2990 { 2991 int error; 2992 int len; 2993 struct inode *inode; 2994 struct page *page; 2995 2996 len = strlen(symname) + 1; 2997 if (len > PAGE_SIZE) 2998 return -ENAMETOOLONG; 2999 3000 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0, 3001 VM_NORESERVE); 3002 if (!inode) 3003 return -ENOSPC; 3004 3005 error = security_inode_init_security(inode, dir, &dentry->d_name, 3006 shmem_initxattrs, NULL); 3007 if (error) { 3008 if (error != -EOPNOTSUPP) { 3009 iput(inode); 3010 return error; 3011 } 3012 error = 0; 3013 } 3014 3015 inode->i_size = len-1; 3016 if (len <= SHORT_SYMLINK_LEN) { 3017 inode->i_link = kmemdup(symname, len, GFP_KERNEL); 3018 if (!inode->i_link) { 3019 iput(inode); 3020 return -ENOMEM; 3021 } 3022 inode->i_op = &shmem_short_symlink_operations; 3023 } else { 3024 inode_nohighmem(inode); 3025 error = shmem_getpage(inode, 0, &page, SGP_WRITE); 3026 if (error) { 3027 iput(inode); 3028 return error; 3029 } 3030 inode->i_mapping->a_ops = &shmem_aops; 3031 inode->i_op = &shmem_symlink_inode_operations; 3032 memcpy(page_address(page), symname, len); 3033 SetPageUptodate(page); 3034 set_page_dirty(page); 3035 unlock_page(page); 3036 put_page(page); 3037 } 3038 dir->i_size += BOGO_DIRENT_SIZE; 3039 dir->i_ctime = dir->i_mtime = current_time(dir); 3040 d_instantiate(dentry, inode); 3041 dget(dentry); 3042 return 0; 3043 } 3044 3045 static void shmem_put_link(void *arg) 3046 { 3047 mark_page_accessed(arg); 3048 put_page(arg); 3049 } 3050 3051 static const char *shmem_get_link(struct dentry *dentry, 3052 struct inode *inode, 3053 struct delayed_call *done) 3054 { 3055 struct page *page = NULL; 3056 int error; 3057 if (!dentry) { 3058 page = find_get_page(inode->i_mapping, 0); 3059 if (!page) 3060 return ERR_PTR(-ECHILD); 3061 if (!PageUptodate(page)) { 3062 put_page(page); 3063 return ERR_PTR(-ECHILD); 3064 } 3065 } else { 3066 error = shmem_getpage(inode, 0, &page, SGP_READ); 3067 if (error) 3068 return ERR_PTR(error); 3069 unlock_page(page); 3070 } 3071 set_delayed_call(done, shmem_put_link, page); 3072 return page_address(page); 3073 } 3074 3075 #ifdef CONFIG_TMPFS_XATTR 3076 /* 3077 * Superblocks without xattr inode operations may get some security.* xattr 3078 * support from the LSM 
"for free". As soon as we have any other xattrs 3079 * like ACLs, we also need to implement the security.* handlers at 3080 * filesystem level, though. 3081 */ 3082 3083 /* 3084 * Callback for security_inode_init_security() for acquiring xattrs. 3085 */ 3086 static int shmem_initxattrs(struct inode *inode, 3087 const struct xattr *xattr_array, 3088 void *fs_info) 3089 { 3090 struct shmem_inode_info *info = SHMEM_I(inode); 3091 const struct xattr *xattr; 3092 struct simple_xattr *new_xattr; 3093 size_t len; 3094 3095 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 3096 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 3097 if (!new_xattr) 3098 return -ENOMEM; 3099 3100 len = strlen(xattr->name) + 1; 3101 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 3102 GFP_KERNEL); 3103 if (!new_xattr->name) { 3104 kfree(new_xattr); 3105 return -ENOMEM; 3106 } 3107 3108 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 3109 XATTR_SECURITY_PREFIX_LEN); 3110 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 3111 xattr->name, len); 3112 3113 simple_xattr_list_add(&info->xattrs, new_xattr); 3114 } 3115 3116 return 0; 3117 } 3118 3119 static int shmem_xattr_handler_get(const struct xattr_handler *handler, 3120 struct dentry *unused, struct inode *inode, 3121 const char *name, void *buffer, size_t size) 3122 { 3123 struct shmem_inode_info *info = SHMEM_I(inode); 3124 3125 name = xattr_full_name(handler, name); 3126 return simple_xattr_get(&info->xattrs, name, buffer, size); 3127 } 3128 3129 static int shmem_xattr_handler_set(const struct xattr_handler *handler, 3130 struct dentry *unused, struct inode *inode, 3131 const char *name, const void *value, 3132 size_t size, int flags) 3133 { 3134 struct shmem_inode_info *info = SHMEM_I(inode); 3135 3136 name = xattr_full_name(handler, name); 3137 return simple_xattr_set(&info->xattrs, name, value, size, flags); 3138 } 3139 3140 static const struct xattr_handler shmem_security_xattr_handler = { 3141 .prefix = XATTR_SECURITY_PREFIX, 3142 .get = shmem_xattr_handler_get, 3143 .set = shmem_xattr_handler_set, 3144 }; 3145 3146 static const struct xattr_handler shmem_trusted_xattr_handler = { 3147 .prefix = XATTR_TRUSTED_PREFIX, 3148 .get = shmem_xattr_handler_get, 3149 .set = shmem_xattr_handler_set, 3150 }; 3151 3152 static const struct xattr_handler *shmem_xattr_handlers[] = { 3153 #ifdef CONFIG_TMPFS_POSIX_ACL 3154 &posix_acl_access_xattr_handler, 3155 &posix_acl_default_xattr_handler, 3156 #endif 3157 &shmem_security_xattr_handler, 3158 &shmem_trusted_xattr_handler, 3159 NULL 3160 }; 3161 3162 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 3163 { 3164 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3165 return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); 3166 } 3167 #endif /* CONFIG_TMPFS_XATTR */ 3168 3169 static const struct inode_operations shmem_short_symlink_operations = { 3170 .get_link = simple_get_link, 3171 #ifdef CONFIG_TMPFS_XATTR 3172 .listxattr = shmem_listxattr, 3173 #endif 3174 }; 3175 3176 static const struct inode_operations shmem_symlink_inode_operations = { 3177 .get_link = shmem_get_link, 3178 #ifdef CONFIG_TMPFS_XATTR 3179 .listxattr = shmem_listxattr, 3180 #endif 3181 }; 3182 3183 static struct dentry *shmem_get_parent(struct dentry *child) 3184 { 3185 return ERR_PTR(-ESTALE); 3186 } 3187 3188 static int shmem_match(struct inode *ino, void *vfh) 3189 { 3190 __u32 *fh = vfh; 3191 __u64 inum = fh[2]; 3192 inum = (inum << 32) | fh[1]; 3193 return 
ino->i_ino == inum && fh[0] == ino->i_generation; 3194 } 3195 3196 /* Find any alias of inode, but prefer a hashed alias */ 3197 static struct dentry *shmem_find_alias(struct inode *inode) 3198 { 3199 struct dentry *alias = d_find_alias(inode); 3200 3201 return alias ?: d_find_any_alias(inode); 3202 } 3203 3204 3205 static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 3206 struct fid *fid, int fh_len, int fh_type) 3207 { 3208 struct inode *inode; 3209 struct dentry *dentry = NULL; 3210 u64 inum; 3211 3212 if (fh_len < 3) 3213 return NULL; 3214 3215 inum = fid->raw[2]; 3216 inum = (inum << 32) | fid->raw[1]; 3217 3218 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 3219 shmem_match, fid->raw); 3220 if (inode) { 3221 dentry = shmem_find_alias(inode); 3222 iput(inode); 3223 } 3224 3225 return dentry; 3226 } 3227 3228 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 3229 struct inode *parent) 3230 { 3231 if (*len < 3) { 3232 *len = 3; 3233 return FILEID_INVALID; 3234 } 3235 3236 if (inode_unhashed(inode)) { 3237 /* Unfortunately insert_inode_hash is not idempotent, 3238 * so as we hash inodes here rather than at creation 3239 * time, we need a lock to ensure we only try 3240 * to do it once 3241 */ 3242 static DEFINE_SPINLOCK(lock); 3243 spin_lock(&lock); 3244 if (inode_unhashed(inode)) 3245 __insert_inode_hash(inode, 3246 inode->i_ino + inode->i_generation); 3247 spin_unlock(&lock); 3248 } 3249 3250 fh[0] = inode->i_generation; 3251 fh[1] = inode->i_ino; 3252 fh[2] = ((__u64)inode->i_ino) >> 32; 3253 3254 *len = 3; 3255 return 1; 3256 } 3257 3258 static const struct export_operations shmem_export_ops = { 3259 .get_parent = shmem_get_parent, 3260 .encode_fh = shmem_encode_fh, 3261 .fh_to_dentry = shmem_fh_to_dentry, 3262 }; 3263 3264 static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, 3265 bool remount) 3266 { 3267 char *this_char, *value, *rest; 3268 struct mempolicy *mpol = NULL; 3269 uid_t uid; 3270 gid_t gid; 3271 3272 while (options != NULL) { 3273 this_char = options; 3274 for (;;) { 3275 /* 3276 * NUL-terminate this option: unfortunately, 3277 * mount options form a comma-separated list, 3278 * but mpol's nodelist may also contain commas. 
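 * For example, in "huge=within_size,mpol=interleave:0-3,5,size=50%" the comma inside the nodelist "0-3,5" is followed by a digit and must be kept, while the comma before "size" terminates the mpol option: hence the isdigit() check below.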
3279 */ 3280 options = strchr(options, ','); 3281 if (options == NULL) 3282 break; 3283 options++; 3284 if (!isdigit(*options)) { 3285 options[-1] = '\0'; 3286 break; 3287 } 3288 } 3289 if (!*this_char) 3290 continue; 3291 if ((value = strchr(this_char,'=')) != NULL) { 3292 *value++ = 0; 3293 } else { 3294 pr_err("tmpfs: No value for mount option '%s'\n", 3295 this_char); 3296 goto error; 3297 } 3298 3299 if (!strcmp(this_char,"size")) { 3300 unsigned long long size; 3301 size = memparse(value,&rest); 3302 if (*rest == '%') { 3303 size <<= PAGE_SHIFT; 3304 size *= totalram_pages; 3305 do_div(size, 100); 3306 rest++; 3307 } 3308 if (*rest) 3309 goto bad_val; 3310 sbinfo->max_blocks = 3311 DIV_ROUND_UP(size, PAGE_SIZE); 3312 } else if (!strcmp(this_char,"nr_blocks")) { 3313 sbinfo->max_blocks = memparse(value, &rest); 3314 if (*rest) 3315 goto bad_val; 3316 } else if (!strcmp(this_char,"nr_inodes")) { 3317 sbinfo->max_inodes = memparse(value, &rest); 3318 if (*rest) 3319 goto bad_val; 3320 } else if (!strcmp(this_char,"mode")) { 3321 if (remount) 3322 continue; 3323 sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; 3324 if (*rest) 3325 goto bad_val; 3326 } else if (!strcmp(this_char,"uid")) { 3327 if (remount) 3328 continue; 3329 uid = simple_strtoul(value, &rest, 0); 3330 if (*rest) 3331 goto bad_val; 3332 sbinfo->uid = make_kuid(current_user_ns(), uid); 3333 if (!uid_valid(sbinfo->uid)) 3334 goto bad_val; 3335 } else if (!strcmp(this_char,"gid")) { 3336 if (remount) 3337 continue; 3338 gid = simple_strtoul(value, &rest, 0); 3339 if (*rest) 3340 goto bad_val; 3341 sbinfo->gid = make_kgid(current_user_ns(), gid); 3342 if (!gid_valid(sbinfo->gid)) 3343 goto bad_val; 3344 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3345 } else if (!strcmp(this_char, "huge")) { 3346 int huge; 3347 huge = shmem_parse_huge(value); 3348 if (huge < 0) 3349 goto bad_val; 3350 if (!has_transparent_hugepage() && 3351 huge != SHMEM_HUGE_NEVER) 3352 goto bad_val; 3353 sbinfo->huge = huge; 3354 #endif 3355 #ifdef CONFIG_NUMA 3356 } else if (!strcmp(this_char,"mpol")) { 3357 mpol_put(mpol); 3358 mpol = NULL; 3359 if (mpol_parse_str(value, &mpol)) 3360 goto bad_val; 3361 #endif 3362 } else { 3363 pr_err("tmpfs: Bad mount option %s\n", this_char); 3364 goto error; 3365 } 3366 } 3367 sbinfo->mpol = mpol; 3368 return 0; 3369 3370 bad_val: 3371 pr_err("tmpfs: Bad value '%s' for mount option '%s'\n", 3372 value, this_char); 3373 error: 3374 mpol_put(mpol); 3375 return 1; 3376 3377 } 3378 3379 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 3380 { 3381 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 3382 struct shmem_sb_info config = *sbinfo; 3383 unsigned long inodes; 3384 int error = -EINVAL; 3385 3386 config.mpol = NULL; 3387 if (shmem_parse_options(data, &config, true)) 3388 return error; 3389 3390 spin_lock(&sbinfo->stat_lock); 3391 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 3392 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) 3393 goto out; 3394 if (config.max_inodes < inodes) 3395 goto out; 3396 /* 3397 * Those tests disallow limited->unlimited while any are in use; 3398 * but we must separately disallow unlimited->limited, because 3399 * in that case we have no record of how much is already in use. 
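 * For example, an instance mounted with size=0 (unlimited) cannot later be remounted with size=1G: the check below rejects that with -EINVAL.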
3400 */ 3401 if (config.max_blocks && !sbinfo->max_blocks) 3402 goto out; 3403 if (config.max_inodes && !sbinfo->max_inodes) 3404 goto out; 3405 3406 error = 0; 3407 sbinfo->huge = config.huge; 3408 sbinfo->max_blocks = config.max_blocks; 3409 sbinfo->max_inodes = config.max_inodes; 3410 sbinfo->free_inodes = config.max_inodes - inodes; 3411 3412 /* 3413 * Preserve previous mempolicy unless mpol remount option was specified. 3414 */ 3415 if (config.mpol) { 3416 mpol_put(sbinfo->mpol); 3417 sbinfo->mpol = config.mpol; /* transfers initial ref */ 3418 } 3419 out: 3420 spin_unlock(&sbinfo->stat_lock); 3421 return error; 3422 } 3423 3424 static int shmem_show_options(struct seq_file *seq, struct dentry *root) 3425 { 3426 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 3427 3428 if (sbinfo->max_blocks != shmem_default_max_blocks()) 3429 seq_printf(seq, ",size=%luk", 3430 sbinfo->max_blocks << (PAGE_SHIFT - 10)); 3431 if (sbinfo->max_inodes != shmem_default_max_inodes()) 3432 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 3433 if (sbinfo->mode != (0777 | S_ISVTX)) 3434 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 3435 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 3436 seq_printf(seq, ",uid=%u", 3437 from_kuid_munged(&init_user_ns, sbinfo->uid)); 3438 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 3439 seq_printf(seq, ",gid=%u", 3440 from_kgid_munged(&init_user_ns, sbinfo->gid)); 3441 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3442 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ 3443 if (sbinfo->huge) 3444 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); 3445 #endif 3446 shmem_show_mpol(seq, sbinfo->mpol); 3447 return 0; 3448 } 3449 3450 #endif /* CONFIG_TMPFS */ 3451 3452 static void shmem_put_super(struct super_block *sb) 3453 { 3454 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 3455 3456 percpu_counter_destroy(&sbinfo->used_blocks); 3457 mpol_put(sbinfo->mpol); 3458 kfree(sbinfo); 3459 sb->s_fs_info = NULL; 3460 } 3461 3462 int shmem_fill_super(struct super_block *sb, void *data, int silent) 3463 { 3464 struct inode *inode; 3465 struct shmem_sb_info *sbinfo; 3466 int err = -ENOMEM; 3467 3468 /* Round up to L1_CACHE_BYTES to resist false sharing */ 3469 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 3470 L1_CACHE_BYTES), GFP_KERNEL); 3471 if (!sbinfo) 3472 return -ENOMEM; 3473 3474 sbinfo->mode = 0777 | S_ISVTX; 3475 sbinfo->uid = current_fsuid(); 3476 sbinfo->gid = current_fsgid(); 3477 sb->s_fs_info = sbinfo; 3478 3479 #ifdef CONFIG_TMPFS 3480 /* 3481 * Per default we only allow half of the physical ram per 3482 * tmpfs instance, limiting inodes to one per page of lowmem; 3483 * but the internal instance is left unlimited. 
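 * (So an option-less mount on a machine with 8 GiB of RAM behaves roughly as if size=4G had been given.)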
3484 */ 3485 if (!(sb->s_flags & SB_KERNMOUNT)) { 3486 sbinfo->max_blocks = shmem_default_max_blocks(); 3487 sbinfo->max_inodes = shmem_default_max_inodes(); 3488 if (shmem_parse_options(data, sbinfo, false)) { 3489 err = -EINVAL; 3490 goto failed; 3491 } 3492 } else { 3493 sb->s_flags |= SB_NOUSER; 3494 } 3495 sb->s_export_op = &shmem_export_ops; 3496 sb->s_flags |= SB_NOSEC; 3497 #else 3498 sb->s_flags |= SB_NOUSER; 3499 #endif 3500 3501 spin_lock_init(&sbinfo->stat_lock); 3502 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 3503 goto failed; 3504 sbinfo->free_inodes = sbinfo->max_inodes; 3505 spin_lock_init(&sbinfo->shrinklist_lock); 3506 INIT_LIST_HEAD(&sbinfo->shrinklist); 3507 3508 sb->s_maxbytes = MAX_LFS_FILESIZE; 3509 sb->s_blocksize = PAGE_SIZE; 3510 sb->s_blocksize_bits = PAGE_SHIFT; 3511 sb->s_magic = TMPFS_MAGIC; 3512 sb->s_op = &shmem_ops; 3513 sb->s_time_gran = 1; 3514 #ifdef CONFIG_TMPFS_XATTR 3515 sb->s_xattr = shmem_xattr_handlers; 3516 #endif 3517 #ifdef CONFIG_TMPFS_POSIX_ACL 3518 sb->s_flags |= SB_POSIXACL; 3519 #endif 3520 uuid_gen(&sb->s_uuid); 3521 3522 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 3523 if (!inode) 3524 goto failed; 3525 inode->i_uid = sbinfo->uid; 3526 inode->i_gid = sbinfo->gid; 3527 sb->s_root = d_make_root(inode); 3528 if (!sb->s_root) 3529 goto failed; 3530 return 0; 3531 3532 failed: 3533 shmem_put_super(sb); 3534 return err; 3535 } 3536 3537 static struct kmem_cache *shmem_inode_cachep; 3538 3539 static struct inode *shmem_alloc_inode(struct super_block *sb) 3540 { 3541 struct shmem_inode_info *info; 3542 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 3543 if (!info) 3544 return NULL; 3545 return &info->vfs_inode; 3546 } 3547 3548 static void shmem_destroy_callback(struct rcu_head *head) 3549 { 3550 struct inode *inode = container_of(head, struct inode, i_rcu); 3551 if (S_ISLNK(inode->i_mode)) 3552 kfree(inode->i_link); 3553 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 3554 } 3555 3556 static void shmem_destroy_inode(struct inode *inode) 3557 { 3558 if (S_ISREG(inode->i_mode)) 3559 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 3560 call_rcu(&inode->i_rcu, shmem_destroy_callback); 3561 } 3562 3563 static void shmem_init_inode(void *foo) 3564 { 3565 struct shmem_inode_info *info = foo; 3566 inode_init_once(&info->vfs_inode); 3567 } 3568 3569 static void shmem_init_inodecache(void) 3570 { 3571 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 3572 sizeof(struct shmem_inode_info), 3573 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); 3574 } 3575 3576 static void shmem_destroy_inodecache(void) 3577 { 3578 kmem_cache_destroy(shmem_inode_cachep); 3579 } 3580 3581 static const struct address_space_operations shmem_aops = { 3582 .writepage = shmem_writepage, 3583 .set_page_dirty = __set_page_dirty_no_writeback, 3584 #ifdef CONFIG_TMPFS 3585 .write_begin = shmem_write_begin, 3586 .write_end = shmem_write_end, 3587 #endif 3588 #ifdef CONFIG_MIGRATION 3589 .migratepage = migrate_page, 3590 #endif 3591 .error_remove_page = generic_error_remove_page, 3592 }; 3593 3594 static const struct file_operations shmem_file_operations = { 3595 .mmap = shmem_mmap, 3596 .get_unmapped_area = shmem_get_unmapped_area, 3597 #ifdef CONFIG_TMPFS 3598 .llseek = shmem_file_llseek, 3599 .read_iter = shmem_file_read_iter, 3600 .write_iter = generic_file_write_iter, 3601 .fsync = noop_fsync, 3602 .splice_read = generic_file_splice_read, 3603 .splice_write = iter_file_splice_write, 3604 .fallocate = 
shmem_fallocate, 3605 #endif 3606 }; 3607 3608 static const struct inode_operations shmem_inode_operations = { 3609 .getattr = shmem_getattr, 3610 .setattr = shmem_setattr, 3611 #ifdef CONFIG_TMPFS_XATTR 3612 .listxattr = shmem_listxattr, 3613 .set_acl = simple_set_acl, 3614 #endif 3615 }; 3616 3617 static const struct inode_operations shmem_dir_inode_operations = { 3618 #ifdef CONFIG_TMPFS 3619 .create = shmem_create, 3620 .lookup = simple_lookup, 3621 .link = shmem_link, 3622 .unlink = shmem_unlink, 3623 .symlink = shmem_symlink, 3624 .mkdir = shmem_mkdir, 3625 .rmdir = shmem_rmdir, 3626 .mknod = shmem_mknod, 3627 .rename = shmem_rename2, 3628 .tmpfile = shmem_tmpfile, 3629 #endif 3630 #ifdef CONFIG_TMPFS_XATTR 3631 .listxattr = shmem_listxattr, 3632 #endif 3633 #ifdef CONFIG_TMPFS_POSIX_ACL 3634 .setattr = shmem_setattr, 3635 .set_acl = simple_set_acl, 3636 #endif 3637 }; 3638 3639 static const struct inode_operations shmem_special_inode_operations = { 3640 #ifdef CONFIG_TMPFS_XATTR 3641 .listxattr = shmem_listxattr, 3642 #endif 3643 #ifdef CONFIG_TMPFS_POSIX_ACL 3644 .setattr = shmem_setattr, 3645 .set_acl = simple_set_acl, 3646 #endif 3647 }; 3648 3649 static const struct super_operations shmem_ops = { 3650 .alloc_inode = shmem_alloc_inode, 3651 .destroy_inode = shmem_destroy_inode, 3652 #ifdef CONFIG_TMPFS 3653 .statfs = shmem_statfs, 3654 .remount_fs = shmem_remount_fs, 3655 .show_options = shmem_show_options, 3656 #endif 3657 .evict_inode = shmem_evict_inode, 3658 .drop_inode = generic_delete_inode, 3659 .put_super = shmem_put_super, 3660 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3661 .nr_cached_objects = shmem_unused_huge_count, 3662 .free_cached_objects = shmem_unused_huge_scan, 3663 #endif 3664 }; 3665 3666 static const struct vm_operations_struct shmem_vm_ops = { 3667 .fault = shmem_fault, 3668 .map_pages = filemap_map_pages, 3669 #ifdef CONFIG_NUMA 3670 .set_policy = shmem_set_policy, 3671 .get_policy = shmem_get_policy, 3672 #endif 3673 }; 3674 3675 static struct dentry *shmem_mount(struct file_system_type *fs_type, 3676 int flags, const char *dev_name, void *data) 3677 { 3678 return mount_nodev(fs_type, flags, data, shmem_fill_super); 3679 } 3680 3681 static struct file_system_type shmem_fs_type = { 3682 .owner = THIS_MODULE, 3683 .name = "tmpfs", 3684 .mount = shmem_mount, 3685 .kill_sb = kill_litter_super, 3686 .fs_flags = FS_USERNS_MOUNT, 3687 }; 3688 3689 int __init shmem_init(void) 3690 { 3691 int error; 3692 3693 /* If rootfs called this, don't re-init */ 3694 if (shmem_inode_cachep) 3695 return 0; 3696 3697 shmem_init_inodecache(); 3698 3699 error = register_filesystem(&shmem_fs_type); 3700 if (error) { 3701 pr_err("Could not register tmpfs\n"); 3702 goto out2; 3703 } 3704 3705 shm_mnt = kern_mount(&shmem_fs_type); 3706 if (IS_ERR(shm_mnt)) { 3707 error = PTR_ERR(shm_mnt); 3708 pr_err("Could not kern_mount tmpfs\n"); 3709 goto out1; 3710 } 3711 3712 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3713 if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) 3714 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; 3715 else 3716 shmem_huge = 0; /* just in case it was patched */ 3717 #endif 3718 return 0; 3719 3720 out1: 3721 unregister_filesystem(&shmem_fs_type); 3722 out2: 3723 shmem_destroy_inodecache(); 3724 shm_mnt = ERR_PTR(error); 3725 return error; 3726 } 3727 3728 #if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) 3729 static ssize_t shmem_enabled_show(struct kobject *kobj, 3730 struct kobj_attribute *attr, char *buf) 3731 { 3732 int values[] = { 
3733 SHMEM_HUGE_ALWAYS, 3734 SHMEM_HUGE_WITHIN_SIZE, 3735 SHMEM_HUGE_ADVISE, 3736 SHMEM_HUGE_NEVER, 3737 SHMEM_HUGE_DENY, 3738 SHMEM_HUGE_FORCE, 3739 }; 3740 int i, count; 3741 3742 for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) { 3743 const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s "; 3744 3745 count += sprintf(buf + count, fmt, 3746 shmem_format_huge(values[i])); 3747 } 3748 buf[count - 1] = '\n'; 3749 return count; 3750 } 3751 3752 static ssize_t shmem_enabled_store(struct kobject *kobj, 3753 struct kobj_attribute *attr, const char *buf, size_t count) 3754 { 3755 char tmp[16]; 3756 int huge; 3757 3758 if (count + 1 > sizeof(tmp)) 3759 return -EINVAL; 3760 memcpy(tmp, buf, count); 3761 tmp[count] = '\0'; 3762 if (count && tmp[count - 1] == '\n') 3763 tmp[count - 1] = '\0'; 3764 3765 huge = shmem_parse_huge(tmp); 3766 if (huge == -EINVAL) 3767 return -EINVAL; 3768 if (!has_transparent_hugepage() && 3769 huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) 3770 return -EINVAL; 3771 3772 shmem_huge = huge; 3773 if (shmem_huge > SHMEM_HUGE_DENY) 3774 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; 3775 return count; 3776 } 3777 3778 struct kobj_attribute shmem_enabled_attr = 3779 __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); 3780 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ 3781 3782 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 3783 bool shmem_huge_enabled(struct vm_area_struct *vma) 3784 { 3785 struct inode *inode = file_inode(vma->vm_file); 3786 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 3787 loff_t i_size; 3788 pgoff_t off; 3789 3790 if (shmem_huge == SHMEM_HUGE_FORCE) 3791 return true; 3792 if (shmem_huge == SHMEM_HUGE_DENY) 3793 return false; 3794 switch (sbinfo->huge) { 3795 case SHMEM_HUGE_NEVER: 3796 return false; 3797 case SHMEM_HUGE_ALWAYS: 3798 return true; 3799 case SHMEM_HUGE_WITHIN_SIZE: 3800 off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); 3801 i_size = round_up(i_size_read(inode), PAGE_SIZE); 3802 if (i_size >= HPAGE_PMD_SIZE && 3803 i_size >> PAGE_SHIFT >= off) 3804 return true; 3805 /* fall through */ 3806 case SHMEM_HUGE_ADVISE: 3807 /* TODO: implement fadvise() hints */ 3808 return (vma->vm_flags & VM_HUGEPAGE); 3809 default: 3810 VM_BUG_ON(1); 3811 return false; 3812 } 3813 } 3814 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ 3815 3816 #else /* !CONFIG_SHMEM */ 3817 3818 /* 3819 * tiny-shmem: simple shmemfs and tmpfs using ramfs code 3820 * 3821 * This is intended for small system where the benefits of the full 3822 * shmem code (swap-backed and resource-limited) are outweighed by 3823 * their complexity. On systems without swap this code should be 3824 * effectively equivalent, but much lighter weight. 
3825 */ 3826 3827 static struct file_system_type shmem_fs_type = { 3828 .name = "tmpfs", 3829 .mount = ramfs_mount, 3830 .kill_sb = kill_litter_super, 3831 .fs_flags = FS_USERNS_MOUNT, 3832 }; 3833 3834 int __init shmem_init(void) 3835 { 3836 BUG_ON(register_filesystem(&shmem_fs_type) != 0); 3837 3838 shm_mnt = kern_mount(&shmem_fs_type); 3839 BUG_ON(IS_ERR(shm_mnt)); 3840 3841 return 0; 3842 } 3843 3844 int shmem_unuse(swp_entry_t swap, struct page *page) 3845 { 3846 return 0; 3847 } 3848 3849 int shmem_lock(struct file *file, int lock, struct user_struct *user) 3850 { 3851 return 0; 3852 } 3853 3854 void shmem_unlock_mapping(struct address_space *mapping) 3855 { 3856 } 3857 3858 #ifdef CONFIG_MMU 3859 unsigned long shmem_get_unmapped_area(struct file *file, 3860 unsigned long addr, unsigned long len, 3861 unsigned long pgoff, unsigned long flags) 3862 { 3863 return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); 3864 } 3865 #endif 3866 3867 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 3868 { 3869 truncate_inode_pages_range(inode->i_mapping, lstart, lend); 3870 } 3871 EXPORT_SYMBOL_GPL(shmem_truncate_range); 3872 3873 #define shmem_vm_ops generic_file_vm_ops 3874 #define shmem_file_operations ramfs_file_operations 3875 #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 3876 #define shmem_acct_size(flags, size) 0 3877 #define shmem_unacct_size(flags, size) do {} while (0) 3878 3879 #endif /* CONFIG_SHMEM */ 3880 3881 /* common code */ 3882 3883 static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, 3884 unsigned long flags, unsigned int i_flags) 3885 { 3886 struct inode *inode; 3887 struct file *res; 3888 3889 if (IS_ERR(mnt)) 3890 return ERR_CAST(mnt); 3891 3892 if (size < 0 || size > MAX_LFS_FILESIZE) 3893 return ERR_PTR(-EINVAL); 3894 3895 if (shmem_acct_size(flags, size)) 3896 return ERR_PTR(-ENOMEM); 3897 3898 inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, 3899 flags); 3900 if (unlikely(!inode)) { 3901 shmem_unacct_size(flags, size); 3902 return ERR_PTR(-ENOSPC); 3903 } 3904 inode->i_flags |= i_flags; 3905 inode->i_size = size; 3906 clear_nlink(inode); /* It is unlinked */ 3907 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); 3908 if (!IS_ERR(res)) 3909 res = alloc_file_pseudo(inode, mnt, name, O_RDWR, 3910 &shmem_file_operations); 3911 if (IS_ERR(res)) 3912 iput(inode); 3913 return res; 3914 } 3915 3916 /** 3917 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be 3918 * kernel internal. There will be NO LSM permission checks against the 3919 * underlying inode. So users of this interface must do LSM checks at a 3920 * higher layer. The users are the big_key and shm implementations. LSM 3921 * checks are provided at the key or shm level rather than the inode. 
3922 * @name: name for dentry (to be seen in /proc/<pid>/maps) 3923 * @size: size to be set for the file 3924 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3925 */ 3926 struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) 3927 { 3928 return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); 3929 } 3930 3931 /** 3932 * shmem_file_setup - get an unlinked file living in tmpfs 3933 * @name: name for dentry (to be seen in /proc/<pid>/maps) 3934 * @size: size to be set for the file 3935 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3936 */ 3937 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) 3938 { 3939 return __shmem_file_setup(shm_mnt, name, size, flags, 0); 3940 } 3941 EXPORT_SYMBOL_GPL(shmem_file_setup); 3942 3943 /** 3944 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs 3945 * @mnt: the tmpfs mount where the file will be created 3946 * @name: name for dentry (to be seen in /proc/<pid>/maps) 3947 * @size: size to be set for the file 3948 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3949 */ 3950 struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, 3951 loff_t size, unsigned long flags) 3952 { 3953 return __shmem_file_setup(mnt, name, size, flags, 0); 3954 } 3955 EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); 3956 3957 /** 3958 * shmem_zero_setup - setup a shared anonymous mapping 3959 * @vma: the vma to be mmapped, as prepared by do_mmap_pgoff 3960 */ 3961 int shmem_zero_setup(struct vm_area_struct *vma) 3962 { 3963 struct file *file; 3964 loff_t size = vma->vm_end - vma->vm_start; 3965 3966 /* 3967 * Cloning a new file under mmap_sem leads to a lock ordering conflict 3968 * between XFS directory reading and selinux: since this file is only 3969 * accessible to the user through its mapping, use S_PRIVATE flag to 3970 * bypass file security, in the same way as shmem_kernel_file_setup(). 3971 */ 3972 file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); 3973 if (IS_ERR(file)) 3974 return PTR_ERR(file); 3975 3976 if (vma->vm_file) 3977 fput(vma->vm_file); 3978 vma->vm_file = file; 3979 vma->vm_ops = &shmem_vm_ops; 3980 3981 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && 3982 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < 3983 (vma->vm_end & HPAGE_PMD_MASK)) { 3984 khugepaged_enter(vma, vma->vm_flags); 3985 } 3986 3987 return 0; 3988 } 3989 3990 /** 3991 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. 3992 * @mapping: the page's address_space 3993 * @index: the page index 3994 * @gfp: the page allocator flags to use if allocating 3995 * 3996 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", 3997 * with any new page allocations done using the specified allocation flags. 3998 * But read_cache_page_gfp() uses the ->readpage() method: which does not 3999 * suit tmpfs, since it may have pages in swapcache, and needs to find those 4000 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 4001 * 4002 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in 4003 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4004 */ 4005 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 4006 pgoff_t index, gfp_t gfp) 4007 { 4008 #ifdef CONFIG_SHMEM 4009 struct inode *inode = mapping->host; 4010 struct page *page; 4011 int error; 4012 4013 BUG_ON(mapping->a_ops != &shmem_aops); 4014 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, 4015 gfp, NULL, NULL, NULL); 4016 if (error) 4017 page = ERR_PTR(error); 4018 else 4019 unlock_page(page); 4020 return page; 4021 #else 4022 /* 4023 * The tiny !SHMEM case uses ramfs without swap 4024 */ 4025 return read_cache_page_gfp(mapping, index, gfp); 4026 #endif 4027 } 4028 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 4029
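/*
 * Illustrative sketch, kept disabled: roughly how a kernel-internal caller
 * might use shmem_kernel_file_setup() and shmem_read_mapping_page_gfp()
 * together.  The function name and the 1 MiB size are arbitrary examples.
 */
#if 0
static int shmem_usage_example(void)
{
	struct file *file;
	struct page *page;

	/* Unlinked, kernel-internal tmpfs file; no LSM checks on the inode. */
	file = shmem_kernel_file_setup("example", 1024 * 1024, VM_NORESERVE);
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* Find or allocate page 0, bringing it back from swap if necessary. */
	page = shmem_read_mapping_page_gfp(file->f_mapping, 0,
					   mapping_gfp_mask(file->f_mapping));
	if (IS_ERR(page)) {
		fput(file);
		return PTR_ERR(page);
	}

	put_page(page);
	fput(file);
	return 0;
}
#endif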